├── .gitignore ├── README.md ├── benchmark ├── chatlmsys │ ├── README.md │ ├── cfg_gen.py │ ├── chatlmsys_translation.py │ ├── merged │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_placement_requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_flexsm.json │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_placement_requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_spatial.json │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_placement_requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_temporal.json │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_placement_requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_flexsm.json │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_placement_requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_spatial.json │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_placement_requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_temporal.json │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_placement_requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_flexsm.json │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_placement_requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_spatial.json │ │ ├── 
merged_requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_placement_requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_temporal.json │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_placement_requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_flexsm.json │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_placement_requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_spatial.json │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_placement_requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_temporal.json │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_placement_requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_flexsm.json │ │ ├── merged_requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_placement_requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_spatial.json │ │ └── merged_requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_placement_requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_temporal.json │ ├── muxserve2spatial.py │ ├── muxserve2temporal.py │ ├── plot.py │ ├── profile.sh │ ├── scan.py │ └── yamls │ │ ├── .gitignore │ │ ├── flexsm │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx10.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx7.yaml │ │ ├── 
requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx8.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx9.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx5.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx6.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx1.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx3.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx4.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size8_idx0.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx6.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx7.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx1.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx2.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx3.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx4.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx5.yaml │ │ ├── 
requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size8_idx0.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx6.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx7.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx1.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx2.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx3.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx4.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx5.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size8_idx0.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx3.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx4.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx5.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx6.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx7.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx9.yaml │ │ ├── 
requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx1.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx2.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size8_idx0.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx1.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx3.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx4.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx5.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx6.yaml │ │ └── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size8_idx0.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4.yaml │ │ ├── spatial │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx0_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx12_spatial.yaml │ │ ├── 
requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx14_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx2_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx3_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx6_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx10_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx11_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx1_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx4_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx5_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx7_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx8_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx13_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx15_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx9_spatial.yaml │ │ ├── 
requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size1_idx0_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size1_idx12_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size1_idx14_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size1_idx2_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size1_idx3_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size1_idx6_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx10_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx11_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx1_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx4_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx5_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx7_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx8_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx13_spatial.yaml │ │ ├── 
requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx15_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx9_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size1_idx0_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size1_idx12_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size1_idx14_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size1_idx2_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size1_idx3_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size1_idx6_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx10_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx11_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx1_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx4_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx5_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx7_spatial.yaml │ │ ├── 
requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx8_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx13_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx15_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx9_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size1_idx0_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size1_idx12_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size1_idx14_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size1_idx2_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size1_idx3_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size1_idx6_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx10_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx11_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx1_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx4_spatial.yaml │ │ ├── 
requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx5_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx7_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx8_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx13_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx15_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx9_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size1_idx0_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size1_idx12_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size1_idx14_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size1_idx2_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size1_idx3_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size1_idx6_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx10_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx11_spatial.yaml │ │ ├── 
requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx1_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx4_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx5_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx7_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx8_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx13_spatial.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx15_spatial.yaml │ │ └── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx9_spatial.yaml │ │ └── temporal │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx10_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx7_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx8_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx9_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx5_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx6_temporal.yaml │ │ ├── 
requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx1_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx3_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx4_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size8_idx0_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx6_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx7_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx1_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx2_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx3_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx4_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx5_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size8_idx0_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx6_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx7_temporal.yaml │ │ ├── 
requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx1_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx2_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx3_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx4_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx5_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size8_idx0_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx3_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx4_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx5_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx6_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx7_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx9_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx1_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx2_temporal.yaml │ │ ├── 
requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size8_idx0_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx1_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx3_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx4_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx5_temporal.yaml │ │ ├── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx6_temporal.yaml │ │ └── requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size8_idx0_temporal.yaml ├── end_to_end │ ├── README.md │ ├── bench_end_to_end_muxserve.py │ ├── plot_alpha_rate_dist.py │ ├── plot_p_latency.py │ ├── process_log_file.ipynb │ ├── run_end_to_end.sh │ └── srun_end_to_end.sh └── parallelism │ ├── bench_parallelism.sh │ ├── benchmark_parallelism.py │ ├── llama-13b_n1.yaml │ ├── llama-13b_n2.yaml │ ├── llama-13b_n4.yaml │ ├── llama-13b_n8.yaml │ ├── llama-30b_n2.yaml │ ├── llama-30b_n4.yaml │ ├── llama-65b_n16.yaml │ ├── llama-65b_n4.yaml │ ├── llama-65b_n8.yaml │ ├── llama-7b_n1.yaml │ ├── llama-7b_n2.yaml │ ├── llama-7b_n4.yaml │ └── llama-7b_n8.yaml ├── csrc ├── batch_scheduler.cpp ├── batch_scheduler.h ├── memory_manager.cpp └── memory_manager.h ├── examples ├── basic │ ├── model_config.yaml │ ├── model_config_spatial_0.yaml │ ├── model_config_spatial_1.yaml │ ├── model_config_spatial_2.yaml │ └── models.yaml ├── model_cfg.yaml └── placement │ ├── README.md │ └── llama.json ├── muxserve ├── __init__.py ├── arg_utils.py ├── config.py ├── constants.py ├── entrypoint.py ├── flexserver │ ├── __init__.py │ ├── dist_utils.py │ 
├── llm_runtime.py │ ├── model_loader.py │ ├── models │ │ ├── __init__.py │ │ └── llama.py │ ├── muxserve_server.py │ ├── p2p_communication.py │ ├── pipeworker.py │ └── sampler.py ├── flexstore │ ├── __init__.py │ ├── manager.py │ └── weight_utils.py ├── launch.py ├── logger.py ├── muxsched │ ├── __init__.py │ ├── launcher.py │ ├── placement.py │ ├── resource.py │ ├── scheduler.py │ └── workload_utils.py ├── shm_utils.py ├── tracer.py └── zmq_utils.py ├── requirements.txt ├── scripts ├── batch_flexsm.sh ├── batch_temporal.sh ├── bench_end_to_end.sh ├── bench_end_to_end_local.sh ├── run_flexserver.sh ├── srun_flexsm.sh ├── srun_spatial.sh ├── srun_temporal.sh ├── start_mps.sh ├── stop_mps.sh └── test.sh ├── setup.py ├── tests └── test_mem_manager.py └── tools ├── ipc_tensor ├── __init__.py ├── zmq_client.py ├── zmq_server.py └── zmq_tool.py └── plots ├── plot_sm.ipynb ├── plot_traffic.py └── throughpt_slo.ipynb /benchmark/chatlmsys/README.md: -------------------------------------------------------------------------------- 1 | # End-to-End Evaluation For Real Scenarios 2 | 3 | * `basic setting:` 4 | ``` 5 | 4 x 8 A100 6 | 55-55.5 days, 8091 requests 7 | 16 models, 4x7b, 6x13b, 3x30b, 3x65b 8 | maxrate from 8 to 32 (req/s) 9 | avg rate from 2 to 11 (req/s) 10 | ``` 11 | 12 | * `file structure:` 13 | ``` 14 | --- 15 | - merged: throughput (tpt) result data for the chatlmsys benchmark 16 | - plot.py: plot the throughput (tpt) and SLO results 17 | - yamls: base YAML files used for placement generation 18 | 19 | - chatlmsys_translation.py: translate the dataset 20 | - cfg_gen.py: generate the placement for muxserve 21 | - muxserve2spatial.py: translate the muxserve config into a spatial config 22 | - muxserve2temporal.py: translate the muxserve config into a temporal config 23 | - cmd_gen.py: generate the run commands for muxserve, spatial, and temporal 24 | - profile.sh: script that launches a benchmark run 25 | - merge.py: merge the result files into the `merged` folder 26 | ``` 27 | 
"""Generate MuxServe placement configs for the chatlmsys benchmark YAMLs."""
import os
from typing import Optional

import yaml

from muxserve.muxsched.placement import PlacementOptimizer

COST_FILE = "/mnt/afs/lurunyu/projects/MuxServe/examples/placement/llama.json"


def get_placement_from_cfg(
        models_yaml: str,
        costfile: str,
        is_greedy=False,
        dump_to_yaml=True,
        dump_dir: Optional[str] = None,
        verbose: bool = False,
        avg_output_len=337,  # sharegpt data
        avg_prompt_len=161  # sharegpt data
):
    """Run ``PlacementOptimizer`` on one models YAML and return its result.

    Args:
        models_yaml: Path of the models/workload YAML handed to the optimizer.
        costfile: Path of the cost file handed to the optimizer.
        is_greedy: Forwarded positionally to ``PlacementOptimizer.optimize``.
        dump_to_yaml: Forwarded to ``optimize``.
        dump_dir: Forwarded to ``optimize``; ``None`` is passed through as-is.
        verbose: Forwarded to ``optimize``.
        avg_output_len: Forwarded to ``optimize``; the default is the
            ShareGPT value.
        avg_prompt_len: Forwarded to ``optimize``; the default is the
            ShareGPT value.

    Returns:
        Whatever ``PlacementOptimizer.optimize`` returns.
    """
    opt = PlacementOptimizer(models_yaml, costfile)

    return opt.optimize(is_greedy,
                        dump_dir=dump_dir,
                        dump_to_yaml=dump_to_yaml,
                        verbose=verbose,
                        avg_output_len=avg_output_len,
                        avg_prompt_len=avg_prompt_len)


if __name__ == "__main__":
    import glob

    to_scan = 'benchmark/chatlmsys/yamls/'
    # Sorted so repeated runs process the inputs in the same order; glob
    # itself returns matches in arbitrary order.
    files = sorted(
        glob.glob(to_scan +
                  'requests_over_time_models_days_from_day55_to_day55*.yaml'))

    dump_dir = 'benchmark/chatlmsys/yamls/muxserve'
    # Make sure the output directory exists before anything is dumped into it.
    os.makedirs(dump_dir, exist_ok=True)

    for file in files:

        with open(file, 'r') as f:
            yml = yaml.safe_load(f)
        # Average lengths come from the workload YAML itself, not from the
        # ShareGPT defaults of get_placement_from_cfg.
        avg_output_len = yml['avg_output_len']
        avg_prompt_len = yml['avg_prompt_len']

        get_placement_from_cfg(
            file,
            COST_FILE,
            False,
            dump_to_yaml=True,
            dump_dir=dump_dir,
            verbose=True,
            avg_output_len=avg_output_len,
            avg_prompt_len=avg_prompt_len,
        )
import yaml
import glob
import copy

# Model-size suffix -> (mesh size, gpu_memory_utilization). The mesh size is
# used below both as "num_gpus" and as "tensor_parallel_size".
MAP_MESH = {
    '7b': (1, 0.65),
    '13b': (2, 0.5),
    '30b': (2, 0.375),
    '65b': (4, 0.375),
}


def _spatial_config(name, model, mesh_size, util):
    """Build the single-model spatial config dict that is dumped to YAML."""
    model_entry = {
        "name": name,
        "model": model,
        "tensor_parallel_size": mesh_size,
        "pipeline_parallel_size": 1,
        "placement": [list(range(mesh_size))],
        "mps_percentage": [100, 90],
        "max_num_seqs": 256,
        "model_dtype": "fp16"
    }
    return {
        "num_gpus": mesh_size,
        "max_num_seqs": 256,
        "overload_threshold": 2,
        "gpu_memory_utilization": util,
        "models": [model_entry],
        "workloads": {
            "workload_file": None
        }
    }


if __name__ == "__main__":
    for file in glob.glob("benchmark/chatlmsys/yamls/*.yaml"):
        # Only the day55 trace is converted.
        if 'day55_to_day55' not in file:
            continue

        with open(file, 'r') as f:
            yml = yaml.safe_load(f)

        # Base name up to the first dot; identical for every model in a file.
        filename_stem = file.split('/')[-1].split('.')[0]

        total_gpu_num = 0
        for idx, instance in enumerate(yml['models']):
            model = instance['model']
            name = instance['name']
            # The size suffix (e.g. "13b") is the text after the last '-'.
            mesh_size, util = MAP_MESH[model.split('-')[-1]]

            # These two models are forced onto a single GPU; their memory
            # utilization still comes from MAP_MESH.
            if name in ('llm-1', 'llm-3'):
                mesh_size = 1

            out_file = f'benchmark/chatlmsys/yamls/spatial/{filename_stem}_GPUnum32_mesh_size{mesh_size}_idx{idx}_spatial.yaml'

            total_gpu_num += mesh_size

            with open(out_file, "w") as f:
                yaml.dump(_spatial_config(name, model, mesh_size, util),
                          f,
                          sort_keys=False)
            print(out_file)
        print(f"total gpu: {total_gpu_num}")
glob.glob("benchmark/chatlmsys/yamls/placement_gen/*.yaml") 7 | files = glob.glob("benchmark/chatlmsys/yamls/muxserve/*.yaml") 8 | 9 | for file in files: 10 | with open(file, 'r') as f: 11 | yml = yaml.safe_load(f) 12 | 13 | temporal = copy.deepcopy(yml) 14 | for i in range(len(temporal['models'])): 15 | temporal['models'][i]['mps_percentage'] = [100, 90] 16 | 17 | filename = file.split('/')[-1] 18 | filename_stem = filename.split('.')[0] 19 | 20 | out_file = f'benchmark/chatlmsys/yamls/temporal/{filename_stem}_temporal.yaml' 21 | 22 | with open(out_file, 'w') as f: 23 | yaml.dump(temporal, f, sort_keys=False) 24 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=$(pwd):$PYTHONPATH 3 | 4 | 5 | if [ "$#" -ne 6 ]; then 6 | echo "Usage: $0 nnodes nprocs yaml mpsfile workloads cuda_devices" 7 | echo "sudo bash benchmark/chatlmsys/profile.sh 1 2 benchmark/chatlmsys/yamls/placement_gen/requests_over_time_models_days_from_day60_to_day65_condense500_N19_maxrate_7_avgrate_1_GPUnum32_mesh_size2_idx8.yaml /mnt/afs/lurunyu/projects/MuxServe/log/mps1 /mnt/afs/lurunyu/data/requests_over_time_models_days_from_day60_to_day65_condense500_N19_maxrate_7_avgrate_1.json 0,1" 8 | echo "sudo bash benchmark/chatlmsys/profile.sh 1 4 benchmark/chatlmsys/yamls/placement_gen/requests_over_time_models_days_from_day30_to_day35_condense1000_N14_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx2.yaml /mnt/afs/lurunyu/projects/MuxServe/log/mps1 /mnt/afs/lurunyu/data/requests_over_time_models_days_from_day30_to_day35_condense1000_N14_maxrate_19_avgrate_6.json 4,5,6,7" 9 | echo "sudo bash benchmark/chatlmsys/profile.sh 1 4 benchmark/chatlmsys/yamls/placement_gen/requests_over_time_models_days_from_day60_to_day65_condense800_N19_maxrate_11_avgrate_2_GPUnum32_mesh_size2_idx7.yaml /mnt/afs/lurunyu/projects/MuxServe/log/mps1 
/mnt/afs/lurunyu/data/requests_over_time_models_days_from_day60_to_day65_condense800_N19_maxrate_11_avgrate_2_GPUnum32_mesh_size2_idx7.json 2,3" 10 | exit 1 11 | fi 12 | 13 | get_available_port() { 14 | local port 15 | port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("127.0.0.1", 0)); print(s.getsockname()[1]); s.close()') 16 | echo "$port" 17 | } 18 | 19 | echo "You should begin to open MPS $MPS_FILE First!!" 20 | echo "You should begin to open MPS $MPS_FILE First!!" 21 | echo "You should begin to open MPS $MPS_FILE First!!" 22 | echo "sudo bash scripts/start_mps.sh $MPS_FILE" 23 | 24 | NNODES="$1" 25 | NPROCS="$2" 26 | YAML="$3" 27 | MPS_FILE=${4:-"/mnt/afs/lurunyu/projects/MuxServe/log/mps"} 28 | workload_file="$5" 29 | # IFS=',' read -ra scales <<< "$6" 30 | CUDA_DEVICE="$6" 31 | export CUDA_VISIBLE_DEVICES=$CUDA_DEVICE 32 | 33 | directory=$(dirname "$YAML") 34 | filename=$(basename "$YAML" .yaml) 35 | non_yaml_part="${directory}/${filename}" 36 | 37 | LOGDIR="log/$non_yaml_part" 38 | mkdir -p ${LOGDIR} 39 | echo "log file: $LOGDIR" 40 | 41 | # for scale in "${scales[@]}"; do 42 | # bash scripts/stop_mps.sh $MPS_FILE 43 | # bash scripts/start_mps.sh $MPS_FILE 44 | # WORKLOAD="${workload_file}_${scale}.json" 45 | WORKLOAD=${workload_file} 46 | export CUDA_MPS_PIPE_DIRECTORY=$MPS_FILE/nvidia-mps 47 | export CUDA_MPS_LOG_DIRECTORY=$MPS_FILE/nvidia-log 48 | 49 | export PATH=/home/lurunyu/envs/miniconda3/envs/muxserve/bin/:$PATH 50 | FLEXSM_SHM_PREFIX="placement_${filename}" python -m muxserve.launch ${YAML} \ 51 | --nnodes=$NNODES --node-rank=0 --master-addr=127.0.0.1 \ 52 | --nproc_per_node=$NPROCS \ 53 | --schedule-approach adbs \ 54 | --workload-file ${WORKLOAD} \ 55 | --max-num-batched-tokens 2048 \ 56 | --server-port $(get_available_port) --flexstore-port $(get_available_port) \ 57 | 2>&1 | tee ${LOGDIR}/log.log 58 | # done 59 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/.gitignore: 
-------------------------------------------------------------------------------- 1 | placement_gen/ 2 | #muxserve/ 3 | #spatial/ 4 | #temporal/ 5 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx10.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5549999999999999 5 | models: 6 | - name: llm-6 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 90 14 | - 60 15 | max_num_seqs: 62 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx7.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5549999999999999 5 | models: 6 | - name: llm-2 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 90 14 | - 30 15 | max_num_seqs: 49 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx8.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | 
gpu_memory_utilization: 0.25999999999999995 5 | models: 6 | - name: llm-5 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 90 14 | - 30 15 | max_num_seqs: 45 16 | model_dtype: fp16 17 | - name: llm-7 18 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 19 | tensor_parallel_size: 1 20 | pipeline_parallel_size: 1 21 | placement: 22 | - - 0 23 | mps_percentage: 24 | - 80 25 | - 30 26 | max_num_seqs: 17 27 | model_dtype: fp16 28 | workloads: 29 | workload_file: 30 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx9.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.7050000000000001 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 80 14 | - 60 15 | max_num_seqs: 111 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx5.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.2975 5 | models: 6 | - name: llm-9 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 30 16 | max_num_seqs: 73 17 | model_dtype: fp16 
18 | - name: llm-13 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 80 27 | - 30 28 | max_num_seqs: 28 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx6.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.505 5 | models: 6 | - name: llm-8 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 80 16 | max_num_seqs: 73 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx1.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.49124999999999996 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 147 19 | model_dtype: fp16 20 | - name: llm-12 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 90 31 | - 30 32 | max_num_seqs: 34 33 | 
model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx3.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.30999999999999994 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 80 17 | - 90 18 | max_num_seqs: 450 19 | model_dtype: fp16 20 | - name: llm-14 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 30 32 | max_num_seqs: 89 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx4.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.27249999999999996 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 90 17 | - 90 18 | max_num_seqs: 287 19 | model_dtype: fp16 20 | - name: llm-10 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | 
mps_percentage: 30 | - 100 31 | - 30 32 | max_num_seqs: 86 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size8_idx0.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.51625 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 8 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | - 4 16 | - 5 17 | - 6 18 | - 7 19 | mps_percentage: 20 | - 90 21 | - 90 22 | max_num_seqs: 576 23 | model_dtype: fp16 24 | - name: llm-16 25 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 26 | tensor_parallel_size: 8 27 | pipeline_parallel_size: 1 28 | placement: 29 | - - 0 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | - 5 35 | - 6 36 | - 7 37 | mps_percentage: 38 | - 100 39 | - 30 40 | max_num_seqs: 158 41 | model_dtype: fp16 42 | workloads: 43 | workload_file: 44 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx6.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.51 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 80 15 | - 60 16 | max_num_seqs: 220 17 | model_dtype: fp16 18 | - name: llm-2 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 20 | tensor_parallel_size: 
2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 90 27 | - 30 28 | max_num_seqs: 84 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx7.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.2975 5 | models: 6 | - name: llm-8 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 80 16 | max_num_seqs: 73 17 | model_dtype: fp16 18 | - name: llm-7 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 80 27 | - 30 28 | max_num_seqs: 17 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx1.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.49124999999999996 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 147 19 | model_dtype: fp16 20 | - name: llm-6 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 
| pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 90 31 | - 30 32 | max_num_seqs: 150 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx2.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.10875000000000004 5 | models: 6 | - name: llm-10 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 50 18 | max_num_seqs: 83 19 | model_dtype: fp16 20 | - name: llm-5 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 90 31 | - 30 32 | max_num_seqs: 22 33 | model_dtype: fp16 34 | - name: llm-13 35 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 36 | tensor_parallel_size: 4 37 | pipeline_parallel_size: 1 38 | placement: 39 | - - 0 40 | - 1 41 | - 2 42 | - 3 43 | mps_percentage: 44 | - 80 45 | - 30 46 | max_num_seqs: 38 47 | model_dtype: fp16 48 | workloads: 49 | workload_file: 50 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx3.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.47375 5 | models: 6 | - name: llm-16 7 | model: 
/mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 89 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx4.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5974999999999999 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 90 17 | - 90 18 | max_num_seqs: 287 19 | model_dtype: fp16 20 | - name: llm-12 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 90 31 | - 30 32 | max_num_seqs: 155 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx5.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5287499999999999 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 80 17 | - 90 18 | max_num_seqs: 
450 19 | model_dtype: fp16 20 | - name: llm-9 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 30 32 | max_num_seqs: 39 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size8_idx0.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.51625 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 8 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | - 4 16 | - 5 17 | - 6 18 | - 7 19 | mps_percentage: 20 | - 90 21 | - 90 22 | max_num_seqs: 576 23 | model_dtype: fp16 24 | - name: llm-14 25 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 26 | tensor_parallel_size: 8 27 | pipeline_parallel_size: 1 28 | placement: 29 | - - 0 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | - 5 35 | - 6 36 | - 7 37 | mps_percentage: 38 | - 100 39 | - 30 40 | max_num_seqs: 87 41 | model_dtype: fp16 42 | workloads: 43 | workload_file: 44 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx6.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.51 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | 
placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 80 15 | - 90 16 | max_num_seqs: 224 17 | model_dtype: fp16 18 | - name: llm-2 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 90 27 | - 30 28 | max_num_seqs: 107 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx7.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.2975 5 | models: 6 | - name: llm-8 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 80 16 | max_num_seqs: 73 17 | model_dtype: fp16 18 | - name: llm-13 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 80 27 | - 30 28 | max_num_seqs: 201 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx1.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.49124999999999996 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 
0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 147 19 | model_dtype: fp16 20 | - name: llm-6 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 90 31 | - 30 32 | max_num_seqs: 198 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx2.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.27249999999999996 5 | models: 6 | - name: llm-10 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 70 18 | max_num_seqs: 89 19 | model_dtype: fp16 20 | - name: llm-5 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 90 31 | - 30 32 | max_num_seqs: 32 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx3.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.30999999999999994 5 | models: 6 | - name: llm-16 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | 
tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 89 19 | model_dtype: fp16 20 | - name: llm-7 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 80 31 | - 30 32 | max_num_seqs: 21 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx4.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5974999999999999 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 90 17 | - 90 18 | max_num_seqs: 287 19 | model_dtype: fp16 20 | - name: llm-12 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 90 31 | - 30 32 | max_num_seqs: 204 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx5.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5287499999999999 5 | models: 6 | - name: 
llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 80 17 | - 90 18 | max_num_seqs: 450 19 | model_dtype: fp16 20 | - name: llm-9 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 30 32 | max_num_seqs: 90 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size8_idx0.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.51625 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 8 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | - 4 16 | - 5 17 | - 6 18 | - 7 19 | mps_percentage: 20 | - 90 21 | - 90 22 | max_num_seqs: 576 23 | model_dtype: fp16 24 | - name: llm-14 25 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 26 | tensor_parallel_size: 8 27 | pipeline_parallel_size: 1 28 | placement: 29 | - - 0 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | - 5 35 | - 6 36 | - 7 37 | mps_percentage: 38 | - 100 39 | - 30 40 | max_num_seqs: 172 41 | model_dtype: fp16 42 | workloads: 43 | workload_file: 44 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx3.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 
2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.7175 5 | models: 6 | - name: llm-5 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 90 15 | - 30 16 | max_num_seqs: 7 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx4.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.43499999999999994 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 90 15 | - 90 16 | max_num_seqs: 143 17 | model_dtype: fp16 18 | - name: llm-6 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 90 27 | - 30 28 | max_num_seqs: 13 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx5.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.7925000000000001 5 | models: 6 | - name: llm-13 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | 
mps_percentage: 14 | - 80 15 | - 30 16 | max_num_seqs: 8 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx6.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.22249999999999998 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 90 15 | - 60 16 | max_num_seqs: 115 17 | model_dtype: fp16 18 | - name: llm-9 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 100 27 | - 30 28 | max_num_seqs: 11 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx7.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.7925000000000001 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 80 15 | - 30 16 | max_num_seqs: 17 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: 20 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx9.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.51 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 80 15 | - 30 16 | max_num_seqs: 211 17 | model_dtype: fp16 18 | - name: llm-12 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 90 27 | - 30 28 | max_num_seqs: 13 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx1.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.38499999999999995 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 147 19 | model_dtype: fp16 20 | - name: llm-8 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 30 32 | max_num_seqs: 23 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | 
-------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx2.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.27249999999999996 5 | models: 6 | - name: llm-10 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 30 18 | max_num_seqs: 12 19 | model_dtype: fp16 20 | - name: llm-2 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 90 31 | - 30 32 | max_num_seqs: 6 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size8_idx0.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.21187500000000004 5 | models: 6 | - name: llm-16 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 8 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | - 4 16 | - 5 17 | - 6 18 | - 7 19 | mps_percentage: 20 | - 100 21 | - 30 22 | max_num_seqs: 17 23 | model_dtype: fp16 24 | - name: llm-14 25 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 26 | tensor_parallel_size: 8 27 | pipeline_parallel_size: 1 28 | placement: 29 | - - 0 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | - 5 35 | - 6 36 | 
- 7 37 | mps_percentage: 38 | - 100 39 | - 30 40 | max_num_seqs: 10 41 | model_dtype: fp16 42 | - name: llm-7 43 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 44 | tensor_parallel_size: 8 45 | pipeline_parallel_size: 1 46 | placement: 47 | - - 0 48 | - 1 49 | - 2 50 | - 3 51 | - 4 52 | - 5 53 | - 6 54 | - 7 55 | mps_percentage: 56 | - 80 57 | - 40 58 | max_num_seqs: 5 59 | model_dtype: fp16 60 | workloads: 61 | workload_file: 62 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx1.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.38499999999999995 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 147 19 | model_dtype: fp16 20 | - name: llm-8 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 30 32 | max_num_seqs: 96 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx3.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.47125000000000006 5 | models: 6 | - name: llm-5 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | 
tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 90 17 | - 30 18 | max_num_seqs: 10 19 | model_dtype: fp16 20 | - name: llm-13 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 80 31 | - 30 32 | max_num_seqs: 15 33 | model_dtype: fp16 34 | - name: llm-7 35 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 36 | tensor_parallel_size: 4 37 | pipeline_parallel_size: 1 38 | placement: 39 | - - 0 40 | - 1 41 | - 2 42 | - 3 43 | mps_percentage: 44 | - 80 45 | - 30 46 | max_num_seqs: 7 47 | model_dtype: fp16 48 | workloads: 49 | workload_file: 50 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx4.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.27249999999999996 5 | models: 6 | - name: llm-16 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 30 18 | max_num_seqs: 89 19 | model_dtype: fp16 20 | - name: llm-12 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 90 31 | - 30 32 | max_num_seqs: 19 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx5.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.27249999999999996 5 | models: 6 | - name: llm-10 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 30 18 | max_num_seqs: 23 19 | model_dtype: fp16 20 | - name: llm-6 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 90 31 | - 30 32 | max_num_seqs: 18 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx6.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.36500000000000005 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 80 17 | - 40 18 | max_num_seqs: 418 19 | model_dtype: fp16 20 | - name: llm-15 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 80 31 | - 30 32 | max_num_seqs: 28 33 | model_dtype: fp16 34 | - name: llm-9 35 | model: 
/mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 36 | tensor_parallel_size: 4 37 | pipeline_parallel_size: 1 38 | placement: 39 | - - 0 40 | - 1 41 | - 2 42 | - 3 43 | mps_percentage: 44 | - 100 45 | - 30 46 | max_num_seqs: 11 47 | model_dtype: fp16 48 | workloads: 49 | workload_file: 50 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/flexsm/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size8_idx0.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.195 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 8 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | - 4 16 | - 5 17 | - 6 18 | - 7 19 | mps_percentage: 20 | - 90 21 | - 30 22 | max_num_seqs: 445 23 | model_dtype: fp16 24 | - name: llm-3 25 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 26 | tensor_parallel_size: 8 27 | pipeline_parallel_size: 1 28 | placement: 29 | - - 0 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | - 5 35 | - 6 36 | - 7 37 | mps_percentage: 38 | - 90 39 | - 30 40 | max_num_seqs: 362 41 | model_dtype: fp16 42 | - name: llm-14 43 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 44 | tensor_parallel_size: 8 45 | pipeline_parallel_size: 1 46 | placement: 47 | - - 0 48 | - 1 49 | - 2 50 | - 3 51 | - 4 52 | - 5 53 | - 6 54 | - 7 55 | mps_percentage: 56 | - 100 57 | - 30 58 | max_num_seqs: 18 59 | model_dtype: fp16 60 | - name: llm-2 61 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 62 | tensor_parallel_size: 8 63 | pipeline_parallel_size: 1 64 | placement: 65 | - - 0 66 | - 1 67 | - 2 68 | - 3 69 | - 4 70 | - 5 71 | - 6 72 | - 7 73 | mps_percentage: 74 | - 90 75 | - 30 76 | max_num_seqs: 11 77 | model_dtype: fp16 78 | workloads: 79 | workload_file: 80 
| -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6.yaml: -------------------------------------------------------------------------------- 1 | cluster: 2 | nnodes: 4 3 | ngpus_per_node: 8 4 | models: 5 | - name: llm-1 6 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 7 | rate: 19.52777777777778 8 | - name: llm-2 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 10 | rate: 2.805555555555556 11 | - name: llm-3 12 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 13 | rate: 17.0 14 | - name: llm-4 15 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 16 | rate: 19.555555555555554 17 | - name: llm-5 18 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 19 | rate: 2.638888888888889 20 | - name: llm-6 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | rate: 4.444444444444445 23 | - name: llm-7 24 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 25 | rate: 2.3611111111111107 26 | - name: llm-8 27 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 28 | rate: 4.916666666666667 29 | - name: llm-9 30 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 31 | rate: 1.8333333333333335 32 | - name: llm-10 33 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 34 | rate: 1.9166666666666667 35 | - name: llm-11 36 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 37 | rate: 14.833333333333334 38 | - name: llm-12 39 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 40 | rate: 4.555555555555555 41 | - name: llm-13 42 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 43 | rate: 4.694444444444444 44 | - name: llm-14 45 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 46 | rate: 2.111111111111111 47 | - name: llm-15 48 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 49 | rate: 7.583333333333333 50 | - name: llm-16 51 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 52 | rate: 
3.25 53 | avg_output_len: 261.31368186874306 54 | avg_prompt_len: 100.17204301075269 55 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8.yaml: -------------------------------------------------------------------------------- 1 | cluster: 2 | nnodes: 4 3 | ngpus_per_node: 8 4 | models: 5 | - name: llm-1 6 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 7 | rate: 26.037037037037038 8 | - name: llm-2 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 10 | rate: 3.740740740740741 11 | - name: llm-3 12 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 13 | rate: 22.666666666666664 14 | - name: llm-4 15 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 16 | rate: 26.074074074074073 17 | - name: llm-5 18 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 19 | rate: 3.5185185185185186 20 | - name: llm-6 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | rate: 5.9259259259259265 23 | - name: llm-7 24 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 25 | rate: 3.148148148148148 26 | - name: llm-8 27 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 28 | rate: 6.555555555555556 29 | - name: llm-9 30 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 31 | rate: 2.4444444444444446 32 | - name: llm-10 33 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 34 | rate: 2.555555555555556 35 | - name: llm-11 36 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 37 | rate: 19.77777777777778 38 | - name: llm-12 39 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 40 | rate: 6.074074074074074 41 | - name: llm-13 42 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 43 | rate: 6.259259259259259 44 | - name: llm-14 45 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 46 | rate: 2.814814814814815 47 | - name: llm-15 48 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 49 | rate: 
10.11111111111111 50 | - name: llm-16 51 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 52 | rate: 4.333333333333334 53 | avg_output_len: 261.31368186874306 54 | avg_prompt_len: 100.17204301075269 55 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11.yaml: -------------------------------------------------------------------------------- 1 | cluster: 2 | nnodes: 4 3 | ngpus_per_node: 8 4 | models: 5 | - name: llm-1 6 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 7 | rate: 32.5462962962963 8 | - name: llm-2 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 10 | rate: 4.6759259259259265 11 | - name: llm-3 12 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 13 | rate: 28.333333333333332 14 | - name: llm-4 15 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 16 | rate: 32.59259259259259 17 | - name: llm-5 18 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 19 | rate: 4.398148148148149 20 | - name: llm-6 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | rate: 7.407407407407407 23 | - name: llm-7 24 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 25 | rate: 3.9351851851851847 26 | - name: llm-8 27 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 28 | rate: 8.194444444444445 29 | - name: llm-9 30 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 31 | rate: 3.055555555555556 32 | - name: llm-10 33 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 34 | rate: 3.1944444444444446 35 | - name: llm-11 36 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 37 | rate: 24.72222222222222 38 | - name: llm-12 39 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 40 | rate: 7.592592592592593 41 | - name: llm-13 42 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 43 | rate: 7.8240740740740735 44 | - name: llm-14 45 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 46 
| rate: 3.5185185185185186 47 | - name: llm-15 48 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 49 | rate: 12.63888888888889 50 | - name: llm-16 51 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 52 | rate: 5.416666666666667 53 | avg_output_len: 261.31368186874306 54 | avg_prompt_len: 100.17204301075269 55 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2.yaml: -------------------------------------------------------------------------------- 1 | cluster: 2 | nnodes: 4 3 | ngpus_per_node: 8 4 | models: 5 | - name: llm-1 6 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 7 | rate: 8.136574074074074 8 | - name: llm-2 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 10 | rate: 1.1689814814814816 11 | - name: llm-3 12 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 13 | rate: 7.083333333333333 14 | - name: llm-4 15 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 16 | rate: 8.148148148148147 17 | - name: llm-5 18 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 19 | rate: 1.0995370370370372 20 | - name: llm-6 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | rate: 1.8518518518518519 23 | - name: llm-7 24 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 25 | rate: 0.9837962962962962 26 | - name: llm-8 27 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 28 | rate: 2.048611111111111 29 | - name: llm-9 30 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 31 | rate: 0.763888888888889 32 | - name: llm-10 33 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 34 | rate: 0.7986111111111112 35 | - name: llm-11 36 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 37 | rate: 6.180555555555555 38 | - name: llm-12 39 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 40 | rate: 1.8981481481481481 41 | - name: llm-13 42 | model: 
/mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 43 | rate: 1.9560185185185184 44 | - name: llm-14 45 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 46 | rate: 0.8796296296296297 47 | - name: llm-15 48 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 49 | rate: 3.1597222222222223 50 | - name: llm-16 51 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 52 | rate: 1.3541666666666667 53 | avg_output_len: 261.31368186874306 54 | avg_prompt_len: 100.17204301075269 55 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4.yaml: -------------------------------------------------------------------------------- 1 | cluster: 2 | nnodes: 4 3 | ngpus_per_node: 8 4 | models: 5 | - name: llm-1 6 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 7 | rate: 13.018518518518519 8 | - name: llm-2 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 10 | rate: 1.8703703703703705 11 | - name: llm-3 12 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 13 | rate: 11.333333333333332 14 | - name: llm-4 15 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 16 | rate: 13.037037037037036 17 | - name: llm-5 18 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 19 | rate: 1.7592592592592593 20 | - name: llm-6 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | rate: 2.9629629629629632 23 | - name: llm-7 24 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 25 | rate: 1.574074074074074 26 | - name: llm-8 27 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 28 | rate: 3.277777777777778 29 | - name: llm-9 30 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 31 | rate: 1.2222222222222223 32 | - name: llm-10 33 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 34 | rate: 1.277777777777778 35 | - name: llm-11 36 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 37 | rate: 9.88888888888889 38 | - name: llm-12 
39 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 40 | rate: 3.037037037037037 41 | - name: llm-13 42 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 43 | rate: 3.1296296296296293 44 | - name: llm-14 45 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 46 | rate: 1.4074074074074074 47 | - name: llm-15 48 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 49 | rate: 5.055555555555555 50 | - name: llm-16 51 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 52 | rate: 2.166666666666667 53 | avg_output_len: 261.31368186874306 54 | avg_prompt_len: 100.17204301075269 55 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx0_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx12_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-13 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: 
fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx14_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx2_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx3_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-4 7 | model: 
/mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx6_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-7 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx10_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx11_spatial.yaml: 
-------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-12 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx1_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-2 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx4_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-5 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | 
-------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx5_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-6 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx7_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-8 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx8_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-9 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | 
tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx13_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-14 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx15_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-16 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx9_spatial.yaml: 
-------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-10 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size1_idx0_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size1_idx12_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-13 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | 
-------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size1_idx14_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size1_idx2_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size1_idx3_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | 
pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size1_idx6_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-7 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx10_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx11_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | 
max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-12 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx1_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-2 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx4_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-5 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx5_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-6 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx7_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-8 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx8_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-9 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | 
mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx13_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-14 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx15_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-16 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx9_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | 
max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-10 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size1_idx0_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size1_idx12_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-13 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size1_idx14_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size1_idx2_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size1_idx3_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 
14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size1_idx6_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-7 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx10_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx11_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | 
models: 6 | - name: llm-12 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx1_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-2 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx4_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-5 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx5_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-6 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx7_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-8 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx8_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-9 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 
| mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx13_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-14 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx15_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-16 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx9_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | 
max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-10 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size1_idx0_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size1_idx12_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-13 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size1_idx14_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size1_idx2_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size1_idx3_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 
90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size1_idx6_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-7 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx10_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx11_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: 
llm-12 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx1_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-2 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx4_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-5 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx5_spatial.yaml: 
-------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-6 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx7_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-8 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx8_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-9 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | 
-------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx13_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-14 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx15_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-16 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx9_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-10 7 | model: 
/mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size1_idx0_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size1_idx12_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-13 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size1_idx14_spatial.yaml: 
-------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size1_idx2_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size1_idx3_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | 
-------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size1_idx6_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.65 5 | models: 6 | - name: llm-7 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 256 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx10_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx11_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-12 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 
9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx1_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-2 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx4_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-5 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx5_spatial.yaml: -------------------------------------------------------------------------------- 1 | 
num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5 5 | models: 6 | - name: llm-6 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx7_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-8 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size2_idx8_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-9 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 256 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx13_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-14 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx15_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-16 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/spatial/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx9_spatial.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.375 5 | models: 6 | - name: llm-10 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 
10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 256 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx10_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5549999999999999 5 | models: 6 | - name: llm-6 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 62 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx7_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5549999999999999 5 | models: 6 | - name: llm-2 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 49 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx8_temporal.yaml: -------------------------------------------------------------------------------- 
1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.25999999999999995 5 | models: 6 | - name: llm-5 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 45 16 | model_dtype: fp16 17 | - name: llm-7 18 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 19 | tensor_parallel_size: 1 20 | pipeline_parallel_size: 1 21 | placement: 22 | - - 0 23 | mps_percentage: 24 | - 100 25 | - 90 26 | max_num_seqs: 17 27 | model_dtype: fp16 28 | workloads: 29 | workload_file: null 30 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size1_idx9_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.7050000000000001 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 1 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | mps_percentage: 13 | - 100 14 | - 90 15 | max_num_seqs: 111 16 | model_dtype: fp16 17 | workloads: 18 | workload_file: null 19 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx5_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.2975 5 | models: 6 | - name: llm-9 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | 
- - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 73 17 | model_dtype: fp16 18 | - name: llm-13 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 100 27 | - 90 28 | max_num_seqs: 28 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: null 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size2_idx6_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.505 5 | models: 6 | - name: llm-8 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 73 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx1_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.49124999999999996 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 147 19 | model_dtype: fp16 20 | - name: llm-12 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | 
pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 34 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx3_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.30999999999999994 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 450 19 | model_dtype: fp16 20 | - name: llm-14 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 89 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size4_idx4_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.27249999999999996 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 287 19 | model_dtype: fp16 20 | - name: 
llm-10 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 86 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1200_N17_maxrate_19_avgrate_6_GPUnum32_mesh_size8_idx0_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.51625 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 8 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | - 4 16 | - 5 17 | - 6 18 | - 7 19 | mps_percentage: 20 | - 100 21 | - 90 22 | max_num_seqs: 576 23 | model_dtype: fp16 24 | - name: llm-16 25 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 26 | tensor_parallel_size: 8 27 | pipeline_parallel_size: 1 28 | placement: 29 | - - 0 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | - 5 35 | - 6 36 | - 7 37 | mps_percentage: 38 | - 100 39 | - 90 40 | max_num_seqs: 158 41 | model_dtype: fp16 42 | workloads: 43 | workload_file: null 44 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx6_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.51 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | 
placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 220 17 | model_dtype: fp16 18 | - name: llm-2 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 100 27 | - 90 28 | max_num_seqs: 84 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: null 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size2_idx7_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.2975 5 | models: 6 | - name: llm-8 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 73 17 | model_dtype: fp16 18 | - name: llm-7 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 100 27 | - 90 28 | max_num_seqs: 17 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: null 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx1_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.49124999999999996 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 4 9 | 
pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 147 19 | model_dtype: fp16 20 | - name: llm-6 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 150 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx2_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.10875000000000004 5 | models: 6 | - name: llm-10 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 83 19 | model_dtype: fp16 20 | - name: llm-5 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 22 33 | model_dtype: fp16 34 | - name: llm-13 35 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 36 | tensor_parallel_size: 4 37 | pipeline_parallel_size: 1 38 | placement: 39 | - - 0 40 | - 1 41 | - 2 42 | - 3 43 | mps_percentage: 44 | - 100 45 | - 90 46 | max_num_seqs: 38 47 | model_dtype: fp16 48 | workloads: 49 | workload_file: null 50 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx3_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.47375 5 | models: 6 | - name: llm-16 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 89 19 | model_dtype: fp16 20 | workloads: 21 | workload_file: null 22 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx4_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5974999999999999 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 287 19 | model_dtype: fp16 20 | - name: llm-12 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 155 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size4_idx5_temporal.yaml: 
-------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5287499999999999 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 450 19 | model_dtype: fp16 20 | - name: llm-9 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 39 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense1600_N17_maxrate_26_avgrate_8_GPUnum32_mesh_size8_idx0_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.51625 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 8 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | - 4 16 | - 5 17 | - 6 18 | - 7 19 | mps_percentage: 20 | - 100 21 | - 90 22 | max_num_seqs: 576 23 | model_dtype: fp16 24 | - name: llm-14 25 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 26 | tensor_parallel_size: 8 27 | pipeline_parallel_size: 1 28 | placement: 29 | - - 0 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | - 5 35 | - 6 36 | - 7 37 | mps_percentage: 38 | - 100 39 | - 90 40 | max_num_seqs: 87 41 | model_dtype: fp16 42 | workloads: 43 | workload_file: null 44 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx6_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.51 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 224 17 | model_dtype: fp16 18 | - name: llm-2 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 100 27 | - 90 28 | max_num_seqs: 107 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: null 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size2_idx7_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.2975 5 | models: 6 | - name: llm-8 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 73 17 | model_dtype: fp16 18 | - name: llm-13 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 100 27 | - 90 28 | max_num_seqs: 201 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: null 32 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx1_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.49124999999999996 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 147 19 | model_dtype: fp16 20 | - name: llm-6 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 198 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx2_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.27249999999999996 5 | models: 6 | - name: llm-10 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 89 19 | model_dtype: fp16 20 | - name: llm-5 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 32 33 | model_dtype: fp16 34 | workloads: 35 | 
workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx3_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.30999999999999994 5 | models: 6 | - name: llm-16 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 89 19 | model_dtype: fp16 20 | - name: llm-7 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 21 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx4_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5974999999999999 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 287 19 | model_dtype: fp16 20 | - name: llm-12 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | 
mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 204 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size4_idx5_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.5287499999999999 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 450 19 | model_dtype: fp16 20 | - name: llm-9 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 90 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense2000_N17_maxrate_32_avgrate_11_GPUnum32_mesh_size8_idx0_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.51625 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 8 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | - 4 16 | - 5 17 | - 6 18 | - 7 19 | mps_percentage: 20 | - 100 21 | - 90 22 | max_num_seqs: 576 23 | model_dtype: fp16 24 | - name: llm-14 25 | model: 
/mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 26 | tensor_parallel_size: 8 27 | pipeline_parallel_size: 1 28 | placement: 29 | - - 0 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | - 5 35 | - 6 36 | - 7 37 | mps_percentage: 38 | - 100 39 | - 90 40 | max_num_seqs: 172 41 | model_dtype: fp16 42 | workloads: 43 | workload_file: null 44 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx3_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.7175 5 | models: 6 | - name: llm-5 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 7 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx4_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.43499999999999994 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 143 17 | model_dtype: fp16 18 | - name: llm-6 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 100 27 | - 90 28 | 
max_num_seqs: 13 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: null 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx5_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.7925000000000001 5 | models: 6 | - name: llm-13 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 8 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx6_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.22249999999999998 5 | models: 6 | - name: llm-3 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 115 17 | model_dtype: fp16 18 | - name: llm-9 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 100 27 | - 90 28 | max_num_seqs: 11 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: null 32 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx7_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.7925000000000001 5 | models: 6 | - name: llm-15 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 17 17 | model_dtype: fp16 18 | workloads: 19 | workload_file: null 20 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size2_idx9_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.51 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 2 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | mps_percentage: 14 | - 100 15 | - 90 16 | max_num_seqs: 211 17 | model_dtype: fp16 18 | - name: llm-12 19 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 20 | tensor_parallel_size: 2 21 | pipeline_parallel_size: 1 22 | placement: 23 | - - 0 24 | - 1 25 | mps_percentage: 26 | - 100 27 | - 90 28 | max_num_seqs: 13 29 | model_dtype: fp16 30 | workloads: 31 | workload_file: null 32 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx1_temporal.yaml: -------------------------------------------------------------------------------- 1 | 
num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.38499999999999995 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 147 19 | model_dtype: fp16 20 | - name: llm-8 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 23 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size4_idx2_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.27249999999999996 5 | models: 6 | - name: llm-10 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 12 19 | model_dtype: fp16 20 | - name: llm-2 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 6 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense500_N17_maxrate_8_avgrate_2_GPUnum32_mesh_size8_idx0_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.21187500000000004 5 | models: 6 | - name: llm-16 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 8 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | - 4 16 | - 5 17 | - 6 18 | - 7 19 | mps_percentage: 20 | - 100 21 | - 90 22 | max_num_seqs: 17 23 | model_dtype: fp16 24 | - name: llm-14 25 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 26 | tensor_parallel_size: 8 27 | pipeline_parallel_size: 1 28 | placement: 29 | - - 0 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | - 5 35 | - 6 36 | - 7 37 | mps_percentage: 38 | - 100 39 | - 90 40 | max_num_seqs: 10 41 | model_dtype: fp16 42 | - name: llm-7 43 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 44 | tensor_parallel_size: 8 45 | pipeline_parallel_size: 1 46 | placement: 47 | - - 0 48 | - 1 49 | - 2 50 | - 3 51 | - 4 52 | - 5 53 | - 6 54 | - 7 55 | mps_percentage: 56 | - 100 57 | - 90 58 | max_num_seqs: 5 59 | model_dtype: fp16 60 | workloads: 61 | workload_file: null 62 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx1_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.38499999999999995 5 | models: 6 | - name: llm-11 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | 
mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 147 19 | model_dtype: fp16 20 | - name: llm-8 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 96 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx3_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.47125000000000006 5 | models: 6 | - name: llm-5 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 10 19 | model_dtype: fp16 20 | - name: llm-13 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 15 33 | model_dtype: fp16 34 | - name: llm-7 35 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 36 | tensor_parallel_size: 4 37 | pipeline_parallel_size: 1 38 | placement: 39 | - - 0 40 | - 1 41 | - 2 42 | - 3 43 | mps_percentage: 44 | - 100 45 | - 90 46 | max_num_seqs: 7 47 | model_dtype: fp16 48 | workloads: 49 | workload_file: null 50 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx4_temporal.yaml: 
-------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.27249999999999996 5 | models: 6 | - name: llm-16 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 89 19 | model_dtype: fp16 20 | - name: llm-12 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 19 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx5_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.27249999999999996 5 | models: 6 | - name: llm-10 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 23 19 | model_dtype: fp16 20 | - name: llm-6 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 18 33 | model_dtype: fp16 34 | workloads: 35 | workload_file: null 36 | -------------------------------------------------------------------------------- 
/benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size4_idx6_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.36500000000000005 5 | models: 6 | - name: llm-4 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 8 | tensor_parallel_size: 4 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | mps_percentage: 16 | - 100 17 | - 90 18 | max_num_seqs: 418 19 | model_dtype: fp16 20 | - name: llm-15 21 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 22 | tensor_parallel_size: 4 23 | pipeline_parallel_size: 1 24 | placement: 25 | - - 0 26 | - 1 27 | - 2 28 | - 3 29 | mps_percentage: 30 | - 100 31 | - 90 32 | max_num_seqs: 28 33 | model_dtype: fp16 34 | - name: llm-9 35 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 36 | tensor_parallel_size: 4 37 | pipeline_parallel_size: 1 38 | placement: 39 | - - 0 40 | - 1 41 | - 2 42 | - 3 43 | mps_percentage: 44 | - 100 45 | - 90 46 | max_num_seqs: 11 47 | model_dtype: fp16 48 | workloads: 49 | workload_file: null 50 | -------------------------------------------------------------------------------- /benchmark/chatlmsys/yamls/temporal/requests_over_time_models_days_from_day55_to_day55_condense800_N17_maxrate_13_avgrate_4_GPUnum32_mesh_size8_idx0_temporal.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.195 5 | models: 6 | - name: llm-1 7 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 8 | tensor_parallel_size: 8 9 | pipeline_parallel_size: 1 10 | placement: 11 | - - 0 12 | - 1 13 | - 2 14 | - 3 15 | - 4 16 | - 5 17 | - 6 18 | - 7 19 | mps_percentage: 20 | - 100 21 | - 90 22 | max_num_seqs: 445 23 | model_dtype: fp16 24 | - 
name: llm-3 25 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 26 | tensor_parallel_size: 8 27 | pipeline_parallel_size: 1 28 | placement: 29 | - - 0 30 | - 1 31 | - 2 32 | - 3 33 | - 4 34 | - 5 35 | - 6 36 | - 7 37 | mps_percentage: 38 | - 100 39 | - 90 40 | max_num_seqs: 362 41 | model_dtype: fp16 42 | - name: llm-14 43 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 44 | tensor_parallel_size: 8 45 | pipeline_parallel_size: 1 46 | placement: 47 | - - 0 48 | - 1 49 | - 2 50 | - 3 51 | - 4 52 | - 5 53 | - 6 54 | - 7 55 | mps_percentage: 56 | - 100 57 | - 90 58 | max_num_seqs: 18 59 | model_dtype: fp16 60 | - name: llm-2 61 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 62 | tensor_parallel_size: 8 63 | pipeline_parallel_size: 1 64 | placement: 65 | - - 0 66 | - 1 67 | - 2 68 | - 3 69 | - 4 70 | - 5 71 | - 6 72 | - 7 73 | mps_percentage: 74 | - 100 75 | - 90 76 | max_num_seqs: 11 77 | model_dtype: fp16 78 | workloads: 79 | workload_file: null 80 | -------------------------------------------------------------------------------- /benchmark/end_to_end/README.md: -------------------------------------------------------------------------------- 1 | 1. Generate configuration: 2 | 3 | First, **replace the model path and dataset path** specified in `bench_end_to_end_muxserve.py` with your own path. Specifically, modify the following variables according to the comments. 4 | 5 | - `MODEL_TO_PATH` 6 | - `SHAREGPT_PATH` 7 | - `TOKENIZED_DATA_CACHE` 8 | 9 | Run the scripts to generate configuration: 10 | ```bash 11 | python bench_end_to_end_muxserve.py 12 | ``` 13 | This will generate the configuration and workloads file for the corresponding end-to-end evaluation in the paper: `alpha` = 0.7, 0.9, 1.3, 1.7, 2.1. 14 | 15 | 2. Start the experiment by running the `run_end_to_end.sh` script.
Execute the following command in your terminal: 16 | 17 | ```bash 18 | bash run_end_to_end.sh [split_llm if 'spatial'] 19 | ``` 20 | 21 | - `launch_type` is chosen from [`muxserve`, `spatial`, `temporal`] 22 | - **Note:** `llm-id` is needed if `launch_type` is temporal; it is specified in the config file 23 | - **Note:** Flexsm utilizes Nvidia MPS. Running the muxserve component in the experiment requires **root** privileges. Replace the password in the script with your password (which is marked as `YOUR_PASSWD` in the `run_end_to_end.sh`). 24 | 25 | An example: 26 | 27 | ```bash 28 | bash run_end_to_end.sh spatial 0 \ 29 | model_cfgs/alpha0.7_scale0.5_max40/spatial_cfg.yaml \ 30 | workloads/alpha0.7_scale0.5_max40/sharegpt_n19_req.json 2 31 | ``` 32 | 33 | Make sure you are in the correct directory where the `run_end_to_end.sh` script is located. This script will initiate the necessary steps to run the end-to-end experiment. 34 | 35 | Once the test is started, run logs will be generated in `${PROJ_DIR}/benchmark/end_to_end/log` by default. 36 | 37 | 3. Extract the evaluation result from log file: 38 | 39 | We provide an automated script `plot_p_latency.py` that performs statistical analysis on evaluation results and visualizes them.
40 | -------------------------------------------------------------------------------- /benchmark/end_to_end/plot_alpha_rate_dist.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | 4 | def gen_power_law_dis(alpha: float, num_models: int) -> list[float]: 5 | rates = [(x + 1)**(-alpha) for x in range(num_models)] 6 | rates_sum = sum(rates) 7 | rates_ratio = [x / rates_sum for x in rates] 8 | 9 | return rates_ratio 10 | 11 | 12 | def plot_single_graph(ax, 13 | x_values, 14 | y_values, 15 | xlabel, 16 | ylabel, 17 | label, 18 | marker="o"): 19 | ax.set_xlabel(xlabel) 20 | ax.set_ylabel(ylabel) 21 | # ax.set_xticks(x_values) 22 | # ax.set_xticklabels(x_values) 23 | ax.plot(x_values, y_values, label=label, marker=marker, markersize=2) 24 | ax.grid() 25 | 26 | 27 | def gen_config_with_power_law(): 28 | # num_models = 19 # 12 x 7B; 4 x 13B; 2 x 30B; 1 x 65B 29 | num_models = 100 30 | alpha_lis = [0.7, 0.9, 1.3, 1.7, 2.1] 31 | max_rate_lis = [40] 32 | rate_scale_lis = [0.75] # 25, 40 33 | 34 | fig, ax = plt.subplots(figsize=(3, 2), dpi=300) 35 | x_label = 'Top-k Models (%)' 36 | y_label = 'Cumulative Rate (%)' 37 | 38 | x = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] 39 | 40 | for alpha in alpha_lis: 41 | label = f"α: {alpha}" 42 | print(f"* α: {alpha}") 43 | for rate_scale in rate_scale_lis: 44 | rates_ratio = gen_power_law_dis(alpha, num_models) 45 | rates_ratio = sorted(rates_ratio, reverse=True) 46 | x_axis = x 47 | y_axis = [ 48 | sum(rates_ratio[:r]) * 100 / sum(rates_ratio) for r in x_axis 49 | ] 50 | print(x_axis, y_axis) 51 | # y_axis = [x * 100 for x in rates_ratio] 52 | print(f"=== rate scale: {rate_scale}") 53 | plot_single_graph(ax, x_axis, y_axis, x_label, y_label, label) 54 | 55 | handles, labels = ax.get_legend_handles_labels() 56 | fig.legend(handles, 57 | labels, 58 | loc='upper center', 59 | ncol=3, 60 | bbox_to_anchor=(0.5, 1.23)) 61 | 62 | fig.tight_layout() 63 | 64 | # 
fig.savefig("benchmark/end_to_end/plot_alpha_rate_dist.jpg", 65 | fig.savefig("benchmark/end_to_end/plot_alpha_rate_dist.pdf", 66 | bbox_inches='tight', 67 | pad_inches=0.05) 68 | 69 | 70 | if __name__ == "__main__": 71 | gen_config_with_power_law() 72 | -------------------------------------------------------------------------------- /benchmark/parallelism/bench_parallelism.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | dir=$(dirname $0) 3 | workdir=$(realpath $dir/../..) 4 | 5 | export PYTHONPATH=$(pwd):$PYTHONPATH 6 | 7 | LOGDIR="log/bench_parallelism" 8 | # LOGDIR="log" 9 | mkdir -p ${LOGDIR} 10 | 11 | IFS=',' read -ra models <<< "$1" 12 | IFS=',' read -ra ngpus <<< "$2" 13 | 14 | MPSDIR="${workdir}/log/mps" 15 | 16 | for model in ${models[@]}; do 17 | for ngpu in ${ngpus[@]}; do 18 | cfg="llama-${model}_n${ngpu}.yaml" 19 | if [ ! -f "benchmark/parallelism/${cfg}" ]; then 20 | echo "Config file ${cfg} does not exist!" 21 | continue 22 | fi 23 | echo "Running ${cfg}..." 
24 | echo "djf@123" | sudo -S sh scripts/start_mps.sh ${MPSDIR} 25 | flexstore_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("127.0.0.1", 0)); print(s.getsockname()[1]); s.close()') 26 | python benchmark/parallelism/benchmark_parallelism.py \ 27 | benchmark/parallelism/${cfg} \ 28 | --nproc-per-node ${ngpu} \ 29 | --mps-dir ${MPSDIR} \ 30 | --workload-file examples/workloads/sharegpt_n1_rate10.json \ 31 | --server-port 4134 --flexstore-port ${flexstore_port} \ 32 | --log-dir ${LOGDIR} \ 33 | 2>&1 | tee log/muxserve_test.log 34 | echo "djf@123" | sudo -S sh scripts/stop_mps.sh ${MPSDIR} 35 | done 36 | done 37 | -------------------------------------------------------------------------------- /benchmark/parallelism/llama-13b_n1.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.4 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 10 | tensor_parallel_size: 1 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0] 14 | mps_percentage: [100, 90, 80, 70, 60, 50, 40, 30] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | -------------------------------------------------------------------------------- /benchmark/parallelism/llama-13b_n2.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.4 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 10 | tensor_parallel_size: 2 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1] 14 | mps_percentage: [100, 90, 80, 70, 60, 50, 40, 30] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | 
-------------------------------------------------------------------------------- /benchmark/parallelism/llama-13b_n4.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 512 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.4 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 10 | tensor_parallel_size: 4 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1, 2, 3] 14 | mps_percentage: [100, 90, 80, 70, 60, 50, 40, 30] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | -------------------------------------------------------------------------------- /benchmark/parallelism/llama-13b_n8.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 1024 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.4 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 10 | tensor_parallel_size: 8 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1, 2, 3, 4, 5, 6, 7] 14 | mps_percentage: [100, 90, 80, 70, 60, 50, 40, 30] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | -------------------------------------------------------------------------------- /benchmark/parallelism/llama-30b_n2.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.4 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 10 | tensor_parallel_size: 2 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1] 14 | mps_percentage: [100, 90, 80, 70, 60, 50, 40, 30] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | 
-------------------------------------------------------------------------------- /benchmark/parallelism/llama-30b_n4.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.4 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 10 | tensor_parallel_size: 4 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1, 2, 3] 14 | mps_percentage: [100, 90, 80, 70, 60, 50, 40, 30] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | -------------------------------------------------------------------------------- /benchmark/parallelism/llama-65b_n16.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 16 2 | max_num_seqs: 1024 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.5 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 10 | tensor_parallel_size: 16 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] 14 | mps_percentage: [100] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | -------------------------------------------------------------------------------- /benchmark/parallelism/llama-65b_n4.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.4 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 10 | tensor_parallel_size: 4 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1, 2, 3] 14 | mps_percentage: [100, 90, 80, 70, 60, 50, 40, 30] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | 
-------------------------------------------------------------------------------- /benchmark/parallelism/llama-65b_n8.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 512 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.5 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-65b 10 | tensor_parallel_size: 8 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1, 2, 3, 4, 5, 6, 7] 14 | mps_percentage: [30, 40, 50, 60, 70, 80, 90, 100] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | -------------------------------------------------------------------------------- /benchmark/parallelism/llama-7b_n1.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.5 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 10 | tensor_parallel_size: 1 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0] 14 | mps_percentage: [100, 90, 80, 70, 60, 50, 40, 30] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | -------------------------------------------------------------------------------- /benchmark/parallelism/llama-7b_n2.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.6 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 10 | tensor_parallel_size: 2 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1] 14 | mps_percentage: [100, 90, 80, 70, 60, 50, 40, 30] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | 
-------------------------------------------------------------------------------- /benchmark/parallelism/llama-7b_n4.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 512 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.5 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 10 | tensor_parallel_size: 4 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1, 2, 3] 14 | mps_percentage: [100, 90, 80, 70, 60, 50, 40, 30] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | -------------------------------------------------------------------------------- /benchmark/parallelism/llama-7b_n8.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 8 2 | max_num_seqs: 1024 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.5 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 10 | tensor_parallel_size: 8 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1, 2, 3, 4, 5, 6, 7] 14 | mps_percentage: [100, 90, 80, 70, 60, 50, 40, 30] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | -------------------------------------------------------------------------------- /csrc/batch_scheduler.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace py = pybind11; 8 | 9 | struct Request { 10 | int idx_; 11 | int maxTokens_; 12 | int contextLen_; 13 | int blockSize_; 14 | int numPhysicalBlocks_; 15 | int logicalBlockIdx_; 16 | int logicalBlockOffset_; 17 | py::array_t prompts_; 18 | std::vector outputs_; 19 | // shape: [maxBocksPerSeq, numLayers, numHeads] 20 | py::array_t allocatedBlocks_; 21 | 22 | Request(int reqId, int maxTokens, py::array_t &prompts, int 
blockSize) 23 | : idx_(reqId), maxTokens_(maxTokens), blockSize_(blockSize), 24 | numPhysicalBlocks_(0), prompts_(prompts), outputs_({}) { 25 | int promptLen = prompts_.size(); 26 | logicalBlockIdx_ = (promptLen + blockSize_ - 1) / blockSize_ - 1; 27 | logicalBlockOffset_ = (promptLen - 1) % blockSize_; 28 | contextLen_ = promptLen; 29 | } 30 | 31 | int appendLogicalToken(int tokenId) { 32 | outputs_.push_back(tokenId); 33 | contextLen_++; 34 | logicalBlockOffset_++; 35 | if (logicalBlockOffset_ == blockSize_) { 36 | logicalBlockIdx_++; 37 | logicalBlockOffset_ = 0; 38 | } 39 | return logicalBlockIdx_ + 1 - numPhysicalBlocks_; 40 | } 41 | 42 | void getPromptSlots(int numLayers, int numHeads, 43 | std::vector &slotMapping) { 44 | int blocksPerToken = numLayers * numHeads; 45 | auto allocatedBlocksPtr = allocatedBlocks_.mutable_data(); 46 | for (int i = 0; i < prompts_.size(); i++) { 47 | int blockIdx = i / blockSize_; 48 | int blockOffset = i % blockSize_; 49 | for (int j = 0; j < blocksPerToken; j++) { 50 | slotMapping.push_back( 51 | int64_t(allocatedBlocksPtr[blockIdx * blocksPerToken + j]) * 52 | blockSize_ + 53 | blockOffset); 54 | } 55 | } 56 | } 57 | 58 | void getLastEmptySlot(int numLayers, int numHeads, 59 | std::vector &slotMapping) { 60 | auto allocatedBlocksPtr = allocatedBlocks_.mutable_data(); 61 | int offset = (numPhysicalBlocks_ - 1) * numLayers * numHeads; 62 | for (int i = 0; i < numLayers * numHeads; i++) { 63 | slotMapping.push_back(int64_t(allocatedBlocksPtr[offset + i]) * 64 | blockSize_ + 65 | logicalBlockOffset_); 66 | } 67 | } 68 | }; 69 | 70 | class BatchScheduler { 71 | public: 72 | BatchScheduler(int numLayers, int numHeads, int maxSeqLen, int blockSize); 73 | 74 | void addRequest(py::array_t &prompt, int reqId, int maxTokens); 75 | 76 | py::array_t tryBatch(std::vector &batchRequestIds, 77 | std::vector &batchLastOutputTokens); 78 | 79 | std::vector getBatchInfo(py::array_t &batchBlockRequest, 80 | py::array_t &blockInfo); 81 | 82 | void 
getBatch(py::array_t &batchInfo, py::array_t &blockInfo, 83 | torch::Tensor &tokenTensor, torch::Tensor &tokenPositionTensor, 84 | torch::Tensor &contextLenTensor, 85 | torch::Tensor &blockTableTensor, 86 | torch::Tensor &slotMappingTensor); 87 | 88 | void releaseRequests(std::vector &requestIds); 89 | 90 | std::vector getPreemptRequests() { return preemptRequests_; } 91 | 92 | std::vector getPromptInfo() { return promptInfo_; } 93 | 94 | std::vector> getBatchReqs() { return batchReqs_; } 95 | 96 | int numLayers_; 97 | int numHeads_; 98 | int maxSeqLen_; 99 | int KVBlockSize_; 100 | int maxBocksPerSeq_; 101 | std::map> requests_; 102 | // placeholder for batch info 103 | std::vector preemptRequests_; 104 | std::vector promptInfo_; 105 | std::vector> batchReqs_; 106 | }; 107 | -------------------------------------------------------------------------------- /csrc/memory_manager.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace py = pybind11; 7 | 8 | class KVStorage { 9 | public: 10 | KVStorage(int numBlocks); 11 | 12 | py::array_t allocate(int size); 13 | 14 | py::array_t allocateBatch(py::array_t &batchAllocInfo, 15 | int numLayers, int numHeads); 16 | 17 | int newBlocksAllocated() { return newBlocksAllocated_; } 18 | 19 | int freeBatch(std::vector &batchRequestIds); 20 | 21 | int numBlocks_; 22 | std::vector freeBlocks_; 23 | std::map> allocatedBlocks_; 24 | 25 | int numBlocksAllocated_; 26 | // temporary record for logging 27 | int newBlocksAllocated_; 28 | }; 29 | -------------------------------------------------------------------------------- /examples/basic/model_config.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.4 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 10 | 
tensor_parallel_size: 4 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1, 2, 3] 14 | mps_percentage: [100, 30] 15 | model_dtype: fp16 16 | - 17 | name: llm-1 18 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 19 | tensor_parallel_size: 4 20 | pipeline_parallel_size: 1 21 | placement: 22 | - [0, 1, 2, 3] 23 | mps_percentage: [100, 40] 24 | model_dtype: fp16 25 | - 26 | name: llm-2 27 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 28 | tensor_parallel_size: 4 29 | pipeline_parallel_size: 1 30 | placement: 31 | - [0, 1, 2, 3] 32 | mps_percentage: [100, 40] 33 | model_dtype: fp16 34 | workloads: 35 | # leave empty, workload generation refer to README 36 | workload_file: 37 | -------------------------------------------------------------------------------- /examples/basic/model_config_spatial_0.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.7 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 10 | tensor_parallel_size: 1 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0] 14 | mps_percentage: [100, 40] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /examples/basic/model_config_spatial_1.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 1 2 | max_num_seqs: 256 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.5 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 10 | tensor_parallel_size: 1 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0] 14 | mps_percentage: [100, 40] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | 20 | 21 | 22 | 
-------------------------------------------------------------------------------- /examples/basic/model_config_spatial_2.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 2 2 | max_num_seqs: 256 3 | overload_threshold: 3 4 | gpu_memory_utilization: 0.7 5 | 6 | models: 7 | - 8 | name: llm-0 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 10 | tensor_parallel_size: 2 11 | pipeline_parallel_size: 1 12 | placement: 13 | - [0, 1] 14 | mps_percentage: [100, 40] 15 | model_dtype: fp16 16 | workloads: 17 | # workload generation refer to README 18 | workload_file: 19 | -------------------------------------------------------------------------------- /examples/basic/models.yaml: -------------------------------------------------------------------------------- 1 | cluster: 2 | nnodes: 1 3 | ngpus_per_node: 4 4 | models: 5 | - name: llm-0 6 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 7 | rate: 12 8 | - name: llm-1 9 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 10 | rate: 5 11 | - name: llm-2 12 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-30b 13 | rate: 3 -------------------------------------------------------------------------------- /examples/model_cfg.yaml: -------------------------------------------------------------------------------- 1 | num_gpus: 4 2 | max_num_seqs: 256 3 | overload_threshold: 2 4 | gpu_memory_utilization: 0.4 5 | 6 | models: 7 | - 8 | # unique name for the model to be served 9 | name: llm-0 10 | # model checkpoint name or path 11 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b 12 | tensor_parallel_size: 1 13 | pipeline_parallel_size: 4 14 | placement: 15 | - [0, 1, 2, 3] 16 | mps_percentage: [80, 30] 17 | model_dtype: fp16 18 | - 19 | name: llm-1 20 | model: /mnt/afs/share/LLMCKPTs/huggyllama/llama-13b 21 | tensor_parallel_size: 1 22 | pipeline_parallel_size: 4 23 | placement: 24 | - [0, 1, 2, 3] 25 | mps_percentage: [90, 40] 26 | model_dtype: fp16 27 | - 28 | name: 
class JobConfig:
    """Configuration for one job (one served model).

    Args:
        name: Unique name for the model to be served.
        model: Name or path of the huggingface model to use.
        pipeline_parallel_size: Number of pipeline parallel groups.
        tensor_parallel_size: Number of tensor parallel groups.
        placement: Groups of GPU indices the model is placed on, as given
            by the ``placement`` entry of the model config.
        mps_percentage: MPS percentages configured for the model.
            ``MuxServeConfig`` counts one runtime process per entry.
        max_num_seqs: Maximum number of sequences for this job.
        model_dtype: Torch dtype of the model weights.
    """

    def __init__(self,
                 name: str,
                 model: str,
                 pipeline_parallel_size: int,
                 tensor_parallel_size: int,
                 placement: List[List[int]],
                 mps_percentage: List[int],
                 max_num_seqs: int,
                 model_dtype: torch.dtype = torch.float16):
        self.name = name
        self.model = model
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.placement = placement
        self.mps_percentage = mps_percentage
        self.max_num_seqs = max_num_seqs
        self.model_dtype = model_dtype


class MuxServeConfig:
    """Configuration for muxserve.

    Every constructor argument is stored unchanged on the instance under
    the same name, with one exception: ``max_num_batched_tokens`` falls back
    to ``DEFAULT_MAX_NUM_BATCHED_TOKENS`` when ``None`` is passed.

    Args:
        job_configs: List of JobConfig.
        num_gpus: Number of GPUs to use.
        block_size: Token block size.
        gpu_memory_utilization: The percentage of GPU memory to be used for
            the flexstore.
        max_num_batched_tokens: Token budget per batch; ``None`` selects
            ``DEFAULT_MAX_NUM_BATCHED_TOKENS``.
        schedule_approach: Name of the scheduling approach. The process
            launcher compares this value against the string ``"fix"``, so
            it is annotated as ``str`` (NOTE(review): confirm against the
            CLI argument definition).

    Attributes set in addition to the arguments:
        head_size: Fixed to 128.
        num_runtime_processes: Total number of ``mps_percentage`` entries
            over all job configs.
    """

    # Used when the caller passes max_num_batched_tokens=None.
    DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048

    def __init__(self, job_configs: List[JobConfig], num_gpus: int,
                 ray_node_address: str, base_ray_port: int,
                 num_ray_cluster: int, mps_dir: str, block_size: int,
                 overload_threshold: int, gpu_memory_utilization: float,
                 max_num_batched_tokens: int, max_num_seqs: int,
                 muxserve_host: str, flexstore_port: int, server_port: int,
                 workload_config: Dict[str, Any], model_config: Dict[Any, Any],
                 model_config_path: str, schedule_approach: str, nnodes: int,
                 nproc_per_node: int, node_rank: int, master_addr: str,
                 master_port: int):
        self.job_configs = job_configs
        self.num_gpus = num_gpus
        self.ray_node_address = ray_node_address
        self.base_ray_port = base_ray_port
        self.num_ray_cluster = num_ray_cluster
        self.mps_dir = mps_dir
        self.block_size = block_size
        self.overload_threshold = overload_threshold
        self.gpu_memory_utilization = gpu_memory_utilization
        if max_num_batched_tokens is not None:
            self.max_num_batched_tokens = max_num_batched_tokens
        else:
            self.max_num_batched_tokens = self.DEFAULT_MAX_NUM_BATCHED_TOKENS
        self.max_num_seqs = max_num_seqs
        self.muxserve_host = muxserve_host
        self.flexstore_port = flexstore_port
        self.server_port = server_port
        self.workload_config = workload_config
        self.model_config = model_config
        self.model_config_path = model_config_path
        self.schedule_approach = schedule_approach
        self.nnodes = nnodes
        self.nproc_per_node = nproc_per_node
        self.node_rank = node_rank
        self.master_addr = master_addr
        self.master_port = master_port

        self.head_size = 128

        # One runtime process per configured MPS percentage of every job.
        self.num_runtime_processes = sum(
            len(job_config.mps_percentage) for job_config in self.job_configs)
SM_HOLD_NAME_FMT = "{}_HD_{}" 2 | 3 | ADD_REQ_NAME_FMT = "{}_ADD_{}" 4 | RET_REQ_NAME_FMT = "{}_RET_{}" 5 | 6 | PREEMPT_REQ_NAME_FMT = "{}_PRE_{}" 7 | -------------------------------------------------------------------------------- /muxserve/entrypoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import asyncio 4 | 5 | from muxserve.arg_utils import MuxServeArgs 6 | from muxserve.config import MuxServeConfig 7 | from muxserve.flexstore.manager import FlexStoreManager 8 | from muxserve.muxsched.scheduler import MuxScheduler 9 | from muxserve.logger import get_logger 10 | 11 | logger = get_logger() 12 | 13 | 14 | def main_flexstore(muxserve_config: MuxServeConfig): 15 | flexstore_manager = FlexStoreManager(muxserve_config) 16 | flexstore_manager.deploy() 17 | 18 | 19 | def main_muxsched(muxserve_config: MuxServeConfig): 20 | muxscheduler = MuxScheduler(muxserve_config) 21 | muxscheduler.serve_models() 22 | asyncio.run(muxscheduler.schedule_loop()) 23 | 24 | 25 | def main(args: argparse.Namespace): 26 | muxserve_args = MuxServeArgs.from_cli_args(args) 27 | muxserve_config = muxserve_args.create_muxserve_config() 28 | 29 | if args.flexstore: 30 | main_flexstore(muxserve_config) 31 | 32 | if args.muxscheduler: 33 | main_muxsched(muxserve_config) 34 | 35 | 36 | if __name__ == '__main__': 37 | parser = argparse.ArgumentParser(description='MuxServe Entry Point') 38 | parser.add_argument("--flexstore", 39 | action="store_true", 40 | help="Launch FlexStore process.") 41 | parser.add_argument("--muxscheduler", 42 | action="store_true", 43 | help="Launch MuxScheduler process.") 44 | parser = MuxServeArgs.add_cli_args(parser) 45 | args = parser.parse_args() 46 | main(args) 47 | -------------------------------------------------------------------------------- /muxserve/flexserver/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EfficientLLMSys/MuxServe/9fb0fa555bb4051bee612b72938e5d85796dffec/muxserve/flexserver/__init__.py -------------------------------------------------------------------------------- /muxserve/flexserver/model_loader.py: -------------------------------------------------------------------------------- 1 | """Utilities for selecting and loading models.""" 2 | import contextlib 3 | from typing import Type, List, Optional, Dict 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.multiprocessing.reductions import rebuild_cuda_tensor 8 | from transformers import PretrainedConfig 9 | 10 | from vllm.config import ModelConfig 11 | from vllm.model_executor.weight_utils import initialize_dummy_weights 12 | from vllm.zmq_tool import ZMQClient 13 | 14 | from muxserve.flexserver.models import * # pylint: disable=wildcard-import 15 | from muxserve.logger import get_logger 16 | 17 | logger = get_logger() 18 | 19 | _MODEL_REGISTRY = { 20 | "LlamaForCausalLM": LlamaForCausalLM, 21 | "LLaMAForCausalLM": LlamaForCausalLM, # For decapoda-research/llama-* 22 | } 23 | 24 | 25 | @contextlib.contextmanager 26 | def _set_default_torch_dtype(dtype: torch.dtype): 27 | """Sets the default torch dtype to the given dtype.""" 28 | old_dtype = torch.get_default_dtype() 29 | torch.set_default_dtype(dtype) 30 | yield 31 | torch.set_default_dtype(old_dtype) 32 | 33 | 34 | def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: 35 | architectures = getattr(config, "architectures", []) 36 | for arch in architectures: 37 | if arch in _MODEL_REGISTRY: 38 | return _MODEL_REGISTRY[arch] 39 | raise ValueError( 40 | f"Model architectures {architectures} are not supported for now. 
" 41 | f"Supported architectures: {list(_MODEL_REGISTRY.keys())}") 42 | 43 | 44 | def update_parameters(model: nn.Module, data: Dict[str, dict]): 45 | for param_name, param in model.named_parameters(): 46 | cuda_tensor = rebuild_cuda_tensor(torch.Tensor, **(data[param_name])) 47 | assert param.shape == cuda_tensor.shape 48 | assert cuda_tensor.is_cuda 49 | param.data = cuda_tensor 50 | 51 | 52 | def load_from_server(model: nn.Module, tcp_client: ZMQClient, 53 | model_config: ModelConfig): 54 | # suppose our model was deployed on single card now 55 | logger.info('connecting server ' 56 | f'from client cuda{str(torch.cuda.current_device())}') 57 | 58 | # ask for the server about the weight 59 | rank = torch.distributed.get_rank() 60 | tcp_client.send_pyobj(["weight", [rank, model_config.model_name]]) 61 | 62 | logger.info('client: connected, waiting data') 63 | data = tcp_client.recv_pyobj() 64 | logger.info('client: data received, rebuilding and printing') 65 | 66 | update_parameters(model, data) 67 | # could be commented because of assert cuda_tensor.is_cuda 68 | model = model.cuda() 69 | 70 | 71 | def get_model(model_config: ModelConfig, 72 | tcp_client: Optional[ZMQClient] = None, 73 | **kwargs) -> nn.Module: 74 | model_class = _get_model_architecture(model_config.hf_config) 75 | 76 | with _set_default_torch_dtype(model_config.dtype): 77 | 78 | # Create a model instance. 79 | # The weights will be initialized as empty tensors. 80 | model = model_class(model_config.hf_config, **kwargs) 81 | 82 | if tcp_client is not None: 83 | load_from_server(model, tcp_client, model_config) 84 | else: 85 | if model_config.load_format == "dummy": 86 | model = model.cuda() 87 | # NOTE(woosuk): For accurate performance evaluation, we assign 88 | # random values to the weights. 89 | initialize_dummy_weights(model) 90 | else: 91 | # Load the weights from the cached or downloaded files. 
class GreedySampler(nn.Module):
    """Sampler that always picks the highest-scoring token (argmax)."""

    def __init__(self, vocab_size: int) -> None:
        super().__init__()
        self.vocab_size = vocab_size

    def forward(self, embedding: torch.Tensor, hidden_states: torch.Tensor,
                input_metadata: InputMetadata):
        """Return the greedily chosen next token id for each sequence.

        Args:
            embedding: Passed through to ``_get_logits`` together with the
                hidden states (presumably the output embedding weights;
                verify against vllm's ``_get_logits``).
            hidden_states: Hidden states of the current batch. For a prompt
                batch only the row at the last position of each prompt is
                kept.
            input_metadata: Batch description. ``num_prompts``,
                ``num_generation_tokens`` and ``prompt_lens`` are read.

        Returns:
            A list of token ids with one entry per prompt (prompt batch) or
            per generation token (decoding batch). An empty list is
            returned when the batch holds neither.
        """
        if input_metadata.num_prompts > 0:
            # Prompt and decoding tokens are never mixed in one batch.
            assert input_metadata.num_generation_tokens == 0
            # cumsum(prompt_lens) - 1 is the flat index of the last token
            # of every prompt.
            indices = np.cumsum(input_metadata.prompt_lens) - 1
            indices = torch.tensor(indices,
                                   dtype=torch.int,
                                   device=hidden_states.device)
            hidden_states = hidden_states.index_select(0, indices)

        logits = _get_logits(hidden_states, embedding, None, self.vocab_size)

        # Default to 0 so that a batch with neither prompts nor generation
        # tokens yields [] instead of failing on an unbound local.
        num_tokens = 0
        if input_metadata.num_prompts > 0:
            num_tokens = input_metadata.num_prompts
        if input_metadata.num_generation_tokens > 0:
            num_tokens = input_metadata.num_generation_tokens

        next_tokens = torch.argmax(logits, dim=-1)
        next_tokens = next_tokens.cpu().numpy().tolist()
        # Keep only the rows that belong to real sequences.
        return next_tokens[:num_tokens]
class NewLineFormatter(logging.Formatter):
    """Adds logging prefix to newlines to align multi-line messages."""

    def __init__(self, fmt, datefmt=None):
        super().__init__(fmt, datefmt)

    def format(self, record):
        """Format ``record`` and repeat the line prefix after every newline.

        Each ``"\\n"`` in the formatted text is replaced by ``"\\r\\n"``
        followed by the prefix (everything the format string puts in front
        of the message). Records with an empty message are left untouched.
        """
        msg = super().format(record)
        if record.message != "":
            prefix = self._line_prefix(msg, record.message)
            msg = msg.replace("\n", "\r\n" + prefix)
        return msg

    @staticmethod
    def _line_prefix(msg, message):
        """Return the text that precedes ``message`` on the first line.

        Splitting ``msg`` on ``message`` cuts at the first occurrence, which
        is wrong when the message text also appears inside the prefix (for
        example a message ``"INFO"`` logged at INFO level with a traceback
        appended). The first output line always ends with the first line of
        the message when the format ends in ``%(message)s``, so the prefix
        is recovered from the end of that line instead. For other formats
        the original split-based lookup is kept.
        """
        first_line = msg.split("\n", 1)[0]
        message_head = message.split("\n", 1)[0]
        if first_line.endswith(message_head):
            return first_line[:len(first_line) - len(message_head)]
        return msg.split(message)[0]
def init_logger(name: str):
    """Return the logger called ``name``, wired to the MuxServe handlers.

    A logger whose name is not a child of ``"MuxServe"`` (for example a
    module ``__name__`` such as ``muxserve.tracer``) does not inherit the
    handler installed on the ``"MuxServe"`` logger, so its INFO records
    would otherwise be dropped. The handlers of the ``"MuxServe"`` logger
    are therefore attached directly, and propagation is switched off so
    records are not emitted a second time by a parent logger.

    Calling this function repeatedly with the same name is safe: a handler
    is never attached twice.

    Args:
        name: Logger name, typically the calling module's ``__name__``.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(name)
    shared = logging.getLogger("MuxServe")
    if logger is shared:
        # Already configured at import time; nothing to add.
        return logger

    logger.setLevel(logging.DEBUG)
    for handler in shared.handlers:
        if handler not in logger.handlers:
            logger.addHandler(handler)
    logger.propagate = False
    return logger
--tensor-parallel-size {tensor_parallel_size} " \ 44 | f"--pipeline-parallel-size {pipeline_parallel_size} " \ 45 | f"--block-size {block_size} --swap-space 1 " \ 46 | f"--max-num-batched-tokens {max_num_batched_tokens} " \ 47 | f"--max-num-seqs {max_num_seqs} " \ 48 | f"--flexstore-port {flexstore_port} --workload-file {workload_file} " \ 49 | f"--mps-percentage {mps_percentage} {prefill_option} {split_option} " \ 50 | f"{runtime_profile_option} " 51 | 52 | proc_env = copy.deepcopy(os.environ) 53 | # proc_env["CUDA_VISIBLE_DEVICES"] = ",".join([str(g) for g in placement]) 54 | if mps_dir is not None: 55 | proc_env["CUDA_MPS_PIPE_DIRECTORY"] = f"{mps_dir}/nvidia-mps" 56 | proc_env["CUDA_MPS_LOG_DIRECTORY"] = f"{mps_dir}/nvidia-log" 57 | if schedule_approach == "fix": 58 | real_mps = mps_percentage - 10 if is_prefill else mps_percentage 59 | else: 60 | real_mps = mps_percentage 61 | proc_env[f"CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"] = str(real_mps) 62 | proc_env["MASTER_ADDR"] = master_addr 63 | proc_env["MASTER_PORT"] = str(master_port) 64 | 65 | logdir = os.environ.get("VLLM_PROC_LOG", "log/vllm_proc") 66 | if not os.path.exists(logdir): 67 | os.makedirs(logdir, exist_ok=True) 68 | model_name = name.split("/")[-1] 69 | if logfile is None: 70 | logfile = f"{logdir}/{model_name}_sm{mps_percentage}.log" 71 | logger.info(f"Start process cmd: {cmd}, Output log file: {logfile}") 72 | 73 | logfile_writer = open(logfile, "w") 74 | logfile_writer.write(f"Start process cmd: {cmd}\n") 75 | logfile_writer.write(f"Environment Variable: \n") 76 | for k, v in proc_env.items(): 77 | logfile_writer.write(f" {k}: {v}\n") 78 | proc = subprocess.Popen( 79 | cmd, 80 | env=proc_env, 81 | shell=True, 82 | stdout=logfile_writer, 83 | stderr=subprocess.STDOUT, 84 | ) 85 | return proc 86 | -------------------------------------------------------------------------------- /muxserve/tracer.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 
class FlexTracer:
    """Accumulate begin/end trace events per process and export them as JSON.

    Every name in ``process_names`` is assigned a pid equal to its position
    in the iterable, and two metadata (``'ph': 'M'``) events are recorded
    for it: one carrying the name and one carrying the sort index.
    """

    def __init__(self, process_names):
        self.trace_events = []
        self.proc_to_pid = {}
        # Start timestamps of events that are open, keyed by process name.
        self.event_holder = {}

        for index, label in enumerate(process_names):
            self.proc_to_pid[label] = index
            self.trace_events.extend([
                {
                    'name': 'thread_name',
                    'ph': 'M',
                    'pid': index,
                    'tid': 0,
                    'args': {
                        'name': f'{label}'
                    }
                },
                {
                    'name': 'thread_sort_index',
                    'ph': 'M',
                    'pid': index,
                    'tid': 0,
                    'args': {
                        'sort_index': index,
                    }
                },
            ])

    def add_trace_event(self, name, cat, proc_name, times):
        """Append a 'B' and an 'E' event for ``proc_name``.

        ``times`` supplies the begin and end timestamps in that order; each
        is multiplied by 1e6 before being stored under ``'ts'``.  Phases are
        paired with ``times`` by ``zip``, so a shorter ``times`` yields
        fewer events.
        """
        for phase, tick in zip('BE', times):
            self.trace_events.append({
                'name': name,
                'cat': cat,
                'ph': phase,
                'pid': self.proc_to_pid[proc_name],
                'tid': 0,
                'ts': tick * 1e6,
            })

    def add_event(self, name, cat, proc_name, ts, start=True):
        """Open (``start=True``) or close (``start=False``) an event.

        Opening only remembers ``ts`` for ``proc_name``.  Closing pops the
        remembered start timestamp (``KeyError`` if there is none) and
        records the begin/end pair through ``add_trace_event``.
        """
        if not start:
            begin_ts = self.event_holder.pop(proc_name)
            self.add_trace_event(name, cat, proc_name, [begin_ts, ts])
        else:
            self.event_holder[proc_name] = ts

    def export(self, trace_file):
        """Write all collected events to ``trace_file`` as a JSON document."""
        logger.info(f"Export execution trace to {trace_file}")
        payload = {'traceEvents': self.trace_events, 'displayTimeUnit': 'ms'}
        with open(f'{trace_file}', 'w') as out:
            json.dump(payload, out)
class ZMQServer(ZMQAgent):
    """ZMQ agent whose socket is bound to a local TCP port.

    ``address`` is stored on the instance but does not take part in the
    bind endpoint: the socket is always bound to ``tcp://*:<port>``.
    """

    def __init__(self, address, port, method=zmq.REP):
        super().__init__()
        self.address, self.port = address, port

        self.context = zmq.Context()
        endpoint = f"tcp://*:{port}"
        sock = self.context.socket(method)
        self.socket = sock
        sock.bind(endpoint)
/scripts/run_flexserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ########################################################### 4 | # for single card test without zmq server 5 | torchrun --standalone --nnodes=1 --nproc-per-node=1 \ 6 | muxserve/flexserver/muxserve_server.py \ 7 | --model /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b \ 8 | --tensor-parallel-size 1 \ 9 | --pipeline-parallel-size 1 \ 10 | --block-size 16 \ 11 | --swap-space 1 \ 12 | --load-format dummy \ 13 | --workload-file /mnt/afs/lurunyu/projects/profiling-muxserve/workloads/workload_bs1_inputlen64_outputlen64.json \ 14 | --mps-percentage 40 15 | ########################################################### 16 | 17 | 18 | ########################################################### 19 | # for single card test with zmq server 20 | python -m muxserve.entrypoint --flexstore --model-config examples/test_cfg.yaml \ 21 | --mps-dir /home/lurunyu/projects/profilig-muxserve/log/mps1 \ 22 | --gpu-memory-utilization 0.2 \ 23 | --flexstore-port 51051 24 | 25 | torchrun --standalone --nnodes=1 --nproc-per-node=1 \ 26 | muxserve/flexserver/muxserve_server.py \ 27 | --model /mnt/afs/share/LLMCKPTs/huggyllama/llama-7b \ 28 | --tensor-parallel-size 1 \ 29 | --pipeline-parallel-size 1 \ 30 | --block-size 16 \ 31 | --swap-space 1 \ 32 | --load-format dummy \ 33 | --flexstore-port 51051 \ 34 | --workload-file /mnt/afs/lurunyu/projects/profiling-muxserve/workloads/workload_bs1_inputlen64_outputlen64.json \ 35 | --mps-percentage 40 36 | ########################################################### 37 | 38 | 39 | ########################################################### 40 | # for multi-card test(not support yet) 41 | python -m muxserve.entrypoint --flexstore --model-config examples/model_cfg.yaml \ 42 | --mps-dir /home/lurunyu/projects/profilig-muxserve/log/mps1 \ 43 | --gpu-memory-utilization 0.2 \ 44 | --flexstore-port 50051 45 | 46 | torchrun --standalone 
#!/bin/bash
# Submit a MuxServe run (schedule approach "adbs") with MPS through srun.
#
# Usage:
#   srun_flexsm.sh <model_config> <workload_file> <nproc_per_node> \
#                  <muxserve_log> <sub_mps_dir> <proc_id> <cluster>
#
#   sub_mps_dir  sub-directory of log/mps used for the MPS pipe/log dirs
#   proc_id      spliced into the server port (48<id>2) and the flexstore
#                port (58<id>1)
#   cluster      1 or 2; selects the partition id
#
# Reads VLLM_PROC_LOG from the environment (defaults to log/vllm_proc).

if [ "$#" -lt 7 ]; then
    echo "Usage: $0 <model_config> <workload_file> <nproc_per_node> <muxserve_log> <sub_mps_dir> <proc_id> <cluster>"
    exit 1
fi

dir=$(dirname "$0")
workdir=$(realpath "$dir/..")

CUDA_PATH=/usr/local/cuda-11.8

NNODES=1

model_config=$1
workload_file=$2
NPROC_PER_NODE=$3
muxserve_log=$4
sub_mps_dir=$5
proc_id=$6
cluster=$7

mps_log_dir=log/mps/$sub_mps_dir

# An unset VLLM_PROC_LOG would otherwise be exported to the job as an empty
# string, which hides the "log/vllm_proc" default the Python side applies
# when the variable is absent.
VLLM_PROC_LOG=${VLLM_PROC_LOG:-log/vllm_proc}

echo "============================================="
echo "model_config: $model_config"
echo "workload_file: $workload_file"
echo "NPROC_PER_NODE: $NPROC_PER_NODE"
echo "muxserve_log: $muxserve_log"
echo "mps_log_dir: $mps_log_dir"
echo "proc_id: $proc_id"
echo "cluster: $cluster"
echo "vllm_log_dir: $VLLM_PROC_LOG"
echo "============================================="
mkdir -p "$mps_log_dir"
mkdir -p "$(dirname "$muxserve_log")"
mkdir -p "$VLLM_PROC_LOG"

usrname=dmhj
resource=N2lS.Ie.I60.${NPROC_PER_NODE}
if [ ${NNODES} -le 1 ]; then
    DIST=StandAlone
    DIST_ARG="--nproc_per_node=$NPROC_PER_NODE"
else
    DIST=AllReduce
    DIST_ARG="--nnodes=\$WORLD_SIZE --nproc_per_node=$NPROC_PER_NODE \
        --master-addr=\$MASTER_ADDR --master-port=\$MASTER_PORT \
        --node-rank=\$RANK"
fi
name=test
Image_ID=registry.cn-sh-01.sensecore.cn/cpii-ccr/clouddev-snapshot:20240102-14h01m22s

# cluster1 afb99c73-b2be-428d-963c-352460ab84cd d43c2524-492e-4df1-be8b-a95e688bd0f7
# cluster2 fa7dc572-ab64-4ad1-b7f2-335823fc8781 d43c2524-492e-4df1-be8b-a95e688bd0f7

# if 8 gpus => cluster1; 4 gpus => cluster2
if [ "$cluster" -eq 1 ]; then
    partition_id=afb99c73-b2be-428d-963c-352460ab84cd
    workspace_id=d43c2524-492e-4df1-be8b-a95e688bd0f7
elif [ "$cluster" -eq 2 ]; then
    partition_id=fa7dc572-ab64-4ad1-b7f2-335823fc8781
    workspace_id=d43c2524-492e-4df1-be8b-a95e688bd0f7
else
    echo "Invalid cluster value: $cluster"
    exit 1
fi

# SECURITY(review): the sudo password piped to `sudo -S` below is hard-coded
# in plain text. It is left unchanged here to keep the job working, but it
# should be replaced by a secret or a NOPASSWD sudoers rule.
#
# NOTE: unescaped variables inside the quoted command (including $PATH and
# $LD_LIBRARY_PATH) are expanded by this script before the job is submitted.
# PYTHONPATH is prefixed with ${workdir}; the variable previously used
# there (`workspace`) is never defined in this script.

# --begin "2024-01-27T03:30" \
srun -p $partition_id \
    --workspace-id $workspace_id \
    --async -o log/srun \
    -N ${NNODES} -r ${resource} -j ${name} --framework pytorch -d ${DIST} \
    --container-image ${Image_ID} \
    --container-mounts=e70f5aef-dd05-11ed-9103-ba18b4912d57:/mnt/afs \
    bash -c "rm -rf /usr/local/nvidia/lib64/*; \
    su - ${usrname} -c \" \
    cd ${workdir} && \
    export CUDA_PATH=${CUDA_PATH} && \
    export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${CUDA_PATH}/extras/CUPTI/lib64:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH && \
    export PATH=/mnt/afs/dmhj/miniconda3/envs/muxserve/bin:/mnt/afs/dmhj/miniconda3/condabin:$PATH && \
    export PATH=${CUDA_PATH}/bin:$PATH && \
    export PYTHONPATH=${workdir}:${PYTHONPATH} && \
    export CUDA_MPS_PIPE_DIRECTORY=$mps_log_dir/nvidia-mps && \
    export CUDA_MPS_LOG_DIRECTORY=$mps_log_dir/nvidia-log && \
    export VLLM_PROC_LOG=$VLLM_PROC_LOG && \
    echo \"dmhj@123\" | sudo -S sh $workdir/scripts/start_mps.sh $mps_log_dir && \
    python -m muxserve.launch $model_config \
        --nproc_per_node=$NPROC_PER_NODE \
        --mps-dir $mps_log_dir \
        --workload-file $workload_file \
        --max-num-batched-tokens 2048 \
        --server-port 48${proc_id}2 --flexstore-port 58${proc_id}1 \
        --schedule-approach adbs \
        ${DIST_ARG} \
        2>&1 | tee $muxserve_log && \
    echo -e \"\n\n\" && \
    echo \"dmhj@123\" | sudo -S sh $workdir/scripts/stop_mps.sh $mps_log_dir && \
    kill -9 \$(pgrep -f 58${proc_id}1) && \
    sleep 180 \""
#!/bin/bash
# Submit a temporal-multiplexing MuxServe run (schedule approach "fcfs")
# through srun.
#
# Usage:
#   srun_temporal.sh <model_config> <workload_file> <nproc_per_node> \
#                    <muxserve_log> <proc_id> <cluster>
#
#   proc_id  spliced into the server port (48<id>2) and the flexstore
#            port (58<id>1)
#   cluster  1 or 2; selects the partition id
#
# Reads VLLM_PROC_LOG from the environment (defaults to log/vllm_proc).

if [ "$#" -lt 6 ]; then
    echo "Usage: $0 <model_config> <workload_file> <nproc_per_node> <muxserve_log> <proc_id> <cluster>"
    exit 1
fi

dir=$(dirname "$0")
workdir=$(realpath "$dir/..")

CUDA_PATH=/usr/local/cuda-11.8

NNODES=1

model_config=$1
workload_file=$2
NPROC_PER_NODE=$3
muxserve_log=$4
proc_id=$5
cluster=$6

# An unset VLLM_PROC_LOG would otherwise be exported to the job as an empty
# string, which hides the "log/vllm_proc" default the Python side applies
# when the variable is absent.
VLLM_PROC_LOG=${VLLM_PROC_LOG:-log/vllm_proc}

echo "============================================="
echo "model_config: $model_config"
echo "workload_file: $workload_file"
echo "NPROC_PER_NODE: $NPROC_PER_NODE"
echo "muxserve_log: $muxserve_log"
echo "proc_id: $proc_id"
echo "cluster: $cluster"
echo "vllm_log_dir: $VLLM_PROC_LOG"
echo "============================================="
mkdir -p "$(dirname "$muxserve_log")"
mkdir -p "$VLLM_PROC_LOG"

usrname=dmhj
resource=N2lS.Ie.I60.${NPROC_PER_NODE}
if [ ${NNODES} -le 1 ]; then
    DIST=StandAlone
    DIST_ARG="--nproc_per_node=$NPROC_PER_NODE"
else
    DIST=AllReduce
    DIST_ARG="--nnodes=\$WORLD_SIZE --nproc_per_node=$NPROC_PER_NODE \
        --master-addr=\$MASTER_ADDR --master-port=\$MASTER_PORT \
        --node-rank=\$RANK"
fi
name=test
Image_ID=registry.cn-sh-01.sensecore.cn/cpii-ccr/clouddev-snapshot:20240102-14h01m22s

# cluster1 afb99c73-b2be-428d-963c-352460ab84cd d43c2524-492e-4df1-be8b-a95e688bd0f7
# cluster2 fa7dc572-ab64-4ad1-b7f2-335823fc8781 d43c2524-492e-4df1-be8b-a95e688bd0f7

# if 8 gpus => cluster1; 4 gpus => cluster2
if [ "$cluster" -eq 1 ]; then
    partition_id=afb99c73-b2be-428d-963c-352460ab84cd
    workspace_id=d43c2524-492e-4df1-be8b-a95e688bd0f7
elif [ "$cluster" -eq 2 ]; then
    partition_id=fa7dc572-ab64-4ad1-b7f2-335823fc8781
    workspace_id=d43c2524-492e-4df1-be8b-a95e688bd0f7
else
    echo "Invalid cluster value: $cluster"
    exit 1
fi

# NOTE: unescaped variables inside the quoted command (including $PATH and
# $LD_LIBRARY_PATH) are expanded by this script before the job is submitted.
# PYTHONPATH is prefixed with ${workdir}; the variable previously used
# there (`workspace`) is never defined in this script.

# --begin "2024-01-27T03:30" \
srun -p $partition_id \
    --workspace-id $workspace_id \
    --async -o log/srun \
    -N ${NNODES} -r ${resource} -j ${name} --framework pytorch -d ${DIST} \
    --container-image ${Image_ID} \
    --container-mounts=e70f5aef-dd05-11ed-9103-ba18b4912d57:/mnt/afs \
    bash -c "rm -rf /usr/local/nvidia/lib64/*; \
    su - ${usrname} -c \" \
    cd ${workdir} && \
    export CUDA_PATH=${CUDA_PATH} && \
    export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${CUDA_PATH}/extras/CUPTI/lib64:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH && \
    export PATH=/mnt/afs/dmhj/miniconda3/envs/muxserve/bin:/mnt/afs/dmhj/miniconda3/condabin:$PATH && \
    export PATH=${CUDA_PATH}/bin:$PATH && \
    export PYTHONPATH=${workdir}:${PYTHONPATH} && \
    export VLLM_PROC_LOG=$VLLM_PROC_LOG && \
    python -m muxserve.launch $model_config \
        --nproc_per_node=$NPROC_PER_NODE \
        --workload-file $workload_file \
        --max-num-batched-tokens 2048 \
        --server-port 48${proc_id}2 --flexstore-port 58${proc_id}1 \
        --schedule-approach fcfs \
        ${DIST_ARG} \
        2>&1 | tee $muxserve_log && \
    kill -9 \$(pgrep -f 58${proc_id}1) && \
    sleep 180 \""
#!/bin/bash
# Stop the NVIDIA MPS control daemon and remove its pipe/log directories.
# The following must be performed with root privilege:
# >>> sudo sh scripts/stop_mps.sh [MPS_DIR]

if [ "$#" -ne 1 ]; then
    echo "Usage: $0 <MPS_DIR>"
    echo "bash scripts/stop_mps.sh /mnt/afs/jfduan/LLMInfer/MuxServe/log/mps"
    exit 1
fi

# Abort on an empty argument so the rm -rf calls below can never resolve to
# /nvidia-mps or /nvidia-log.
MPSDIR=${1:?MPS_DIR must not be empty}

# scripts/start_mps.sh launches the daemon with these two directories, so
# the control client is given the same ones before sending "quit".
export CUDA_MPS_PIPE_DIRECTORY="${MPSDIR}/nvidia-mps"
export CUDA_MPS_LOG_DIRECTORY="${MPSDIR}/nvidia-log"

echo quit | nvidia-cuda-mps-control
# Fallback in case the quit request did not stop the daemon.
pkill -f nvidia-cuda-mps-control

rm -rf "${MPSDIR}/nvidia-mps"
rm -rf "${MPSDIR}/nvidia-log"
def main():
    """Serve the module-level CUDA ``test_tensor`` to a client over ZMQ.

    The tensor's storage is shared once via ``storage._share_cuda_()``.
    Each pass of the loop then handles one client session:

    1. wait for a greeting string, reply with the tensor/storage metadata;
    2. wait for the client's signal and print the tensor;
    3. zero row 1 of the tensor and send ``'MODF'``;
    4. print the tensor, wait for the final string and reply ``'bye'``.

    The loop ends when that final string is ``'break'``.
    """
    print(test_tensor)
    test_tensor_1 = test_tensor[1:2, 0:2]
    print(test_tensor.size(), test_tensor.stride(),
          test_tensor.storage_offset())
    print(test_tensor_1.size(), test_tensor_1.stride(),
          test_tensor_1.storage_offset())

    storage = test_tensor.storage()
    # The tuple returned by _share_cuda_() is consumed positionally below as
    # (storage_device, storage_handle, storage_size_bytes,
    #  storage_offset_bytes, ref_counter_handle, ref_counter_offset,
    #  event_handle, event_sync_required).
    t = storage._share_cuda_()
    tcp_server = ZMQServer('localhost', PORT)

    # None of these values change between sessions, so the payload is built
    # once and sent as-is on every pass.
    params_dict = {
        "tensor_size": test_tensor.size(),
        "tensor_stride": test_tensor.stride(),
        "tensor_offset": test_tensor.storage_offset(),
        "storage_cls": type(storage),
        "dtype": test_tensor.dtype,
        "storage_device": t[0],
        "storage_handle": t[1],
        "storage_size_bytes": t[2],
        "storage_offset_bytes": t[3],
        "requires_grad": test_tensor.requires_grad,
        "ref_counter_handle": t[4],
        "ref_counter_offset": t[5],
        "event_handle": t[6],
        "event_sync_required": t[7]
    }

    while True:
        timestamp('socket opened, waiting', 'server')

        data = tcp_server.recv_string()
        timestamp(f'received from client: {data}', 'server')

        time.sleep(1)
        timestamp('sending metadata of tensor storage', 'server')
        tcp_server.send_pyobj(params_dict)

        time.sleep(1)
        data = tcp_server.recv_string()
        timestamp(f'received signal from client, printing: {data}', 'server')
        print(test_tensor)

        time.sleep(1)
        test_tensor[1, :] = 0
        timestamp('server modified tensor, sending signal to client', 'server')
        tcp_server.send_string('MODF')

        time.sleep(1)
        timestamp('printing final tensor, and waiting for client signal',
                  'server')
        print(test_tensor)

        data = tcp_server.recv_string()
        timestamp(f'received signal from client, bye: {data}', 'server')
        tcp_server.send_string('bye')
        if data == 'break':
            break
class ZMQAgent:
    """Thin wrapper that forwards string/pyobj send and receive to a socket.

    The base class only initialises ``self.socket`` to ``None``; subclasses
    are expected to replace it with a real socket object.
    """

    def __init__(self):
        self.socket = None

    def __del__(self):
        # The socket may never have been created (a bare ZMQAgent, or a
        # subclass whose socket creation raised), so only close a real one
        # rather than raising AttributeError during finalisation.
        sock = getattr(self, "socket", None)
        if sock is not None:
            sock.close()

    def send_string(self, msg):
        """Send ``msg`` through the socket's ``send_string``."""
        self.socket.send_string(msg)

    def recv_string(self):
        """Return the next string from the socket's ``recv_string``."""
        return self.socket.recv_string()

    def send_pyobj(self, msg):
        """Send the Python object ``msg`` through the socket's ``send_pyobj``."""
        self.socket.send_pyobj(msg)

    def recv_pyobj(self):
        """Return the next Python object from the socket's ``recv_pyobj``."""
        return self.socket.recv_pyobj()