├── .gitattributes ├── .gitignore ├── .gitmodules ├── README.md ├── auto_search ├── new_search.py ├── profileAnalysis.py ├── search_result_json │ ├── 70B_search_result_reverse_v3.json │ └── 8B_allreduce_naive_search_result.json └── test_db.py ├── config_all ├── .gitignore ├── llama2-70B │ ├── 1024.json │ ├── 2048.json │ ├── 768.json │ ├── correct_40G │ │ ├── 2048.json │ │ ├── nanobatch-only.json │ │ ├── non-overlap.json │ │ └── offload.json │ ├── fewer_layers │ │ ├── 1024.json │ │ ├── 2048.json │ │ ├── 768.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── nanobatch-only.json │ ├── non-overlap.json │ └── pllm-offload.json ├── llama3-70B │ ├── 1024.json │ ├── 2048.json │ ├── 768.json │ ├── correct_40G │ │ ├── 2048.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── fewer_layers │ │ ├── 1024.json │ │ ├── 2048.json │ │ ├── 768.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── nanobatch-only.json │ └── non-overlap.json ├── llama3-8B │ ├── 1024-1-layer.json │ ├── 1024-h100-2.json │ ├── 1024-h100.json │ ├── 1024-small-batch.json │ ├── 1024.json │ ├── correct_40G │ │ ├── 1024-reconfig.json │ │ ├── 1024.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── fewer_layers │ │ ├── 1024.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── nanobatch-only-h100.json │ ├── nanobatch-only.json │ ├── non-overlap-h100.json │ ├── non-overlap.json │ └── offload.json ├── llama3.1-70B │ ├── 1024.json │ ├── 2048.json │ ├── 768.json │ ├── correct_40G │ │ ├── 2048.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── fewer_layers │ │ ├── 1024.json │ │ ├── 2048.json │ │ ├── 768.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── nanobatch-only.json │ └── non-overlap.json ├── llama3.1-8B │ ├── 1024.json │ ├── correct_40G │ │ ├── 1024.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── fewer_layers │ │ ├── 1024.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── nanobatch-only.json │ └── non-overlap.json ├── mixtral-8-7B │ ├── 6144.json │ ├── correct_40G │ │ ├── 6144.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── fewer_layers │ │ ├── 6144.json │ │ ├── nanobatch-only.json │ │ └── non-overlap.json │ ├── nanobatch-only.json │ └── non-overlap.json └── qwen2-72B │ ├── 2048.json │ ├── correct_40G │ ├── 2048.json │ ├── nanobatch-only.json │ └── non-overlap.json │ ├── fewer_layers │ ├── 2048.json │ ├── nanobatch-only.json │ └── non-overlap.json │ ├── nanobatch-only.json │ └── non-overlap.json ├── core ├── IOWrapper.py ├── bufferAllocate.py ├── categoryType.py ├── executor.py ├── nanobatchSplit.py ├── processWeight.py ├── weightManager.py ├── weightSaver.py ├── weightWrapper.py └── worker.py ├── entry ├── .gitignore ├── compare.py ├── compare_kv_cache.py ├── easy_test.py ├── run_llama3.py ├── run_llama3.sh ├── test.py ├── test_multi_gpu3.py ├── test_multi_gpu_tp2.sh └── test_multi_gpu_tp4.sh ├── figures ├── NanoflowLogo.png ├── OfflineThroughput.png ├── SampleOutput.png ├── SystemDesign.png ├── async-schedule.png ├── feasibility.png ├── nanoflow-osdi-simplify.pdf ├── online-latency.png ├── pipeline.gif └── serve.png ├── installAnaconda.sh ├── kvcache ├── kv.py └── triton │ └── kv_copy.py ├── matplot ├── draw_profile_dec.py ├── draw_profile_gemm.py └── draw_single_gpu_performance_compare.py ├── models ├── llama3-70B.py ├── llama3_70B_FlashinferKVCache_allgather.py ├── llama3_70B_FlashinferKVCache_allreduce.py ├── llama3_70B_KVCacheFA_TP8.py ├── llama3_70B_KVCacheTorch_allgather.py ├── llama3_70B_KVCacheTorch_allreduce.py ├── llama3_70B_allreduce_AutoSearch.py ├── llama3_8B_FlashinferKVCache_allreduce.py ├── llama3_8B_KVCacheFA_allgather.py ├── llama3_8B_KVCacheFA_allreduce.py ├── llama3_8B_allreduce_AutoSearch.py ├── llama3_AutoSearch.py ├── llama3_FlashinferKVCache.py ├── llama3_KVCacheFA.py └── llama3_KVCacheTorch.py ├── operations ├── activation │ └── silu.py ├── allgather │ └── allgather.py ├── allreduce │ └── allreduce.py ├── attention │ ├── llamaAttention_flashattn.py │ ├── llamaAttention_flashinfer.py │ ├── llamaAttention_torch.py │ └── llamaAttention_vllm.py ├── embedding │ └── embedding.py ├── gemm │ ├── gemm_K_parallel.py │ ├── gemm_N_parallel.py │ └── gemm_impls.py ├── globalOp │ └── globalOp.py ├── impl_base.py ├── norm │ ├── rmsnorm.py │ └── triton │ │ └── kernels │ │ └── rmsnorm.py ├── operation_base.py ├── recv │ └── recv.py ├── rope │ ├── help_functions.py │ ├── rope_fa.py │ ├── rope_flashinfer.py │ ├── rope_torch.py │ └── triton │ │ └── kernels │ │ └── rope.py ├── sampling │ └── max_sampling.py ├── send │ └── send.py └── virtualOp │ └── virtual_ops.py ├── platform_config.py ├── playground ├── compare.py ├── gt_llama.py ├── operations.ipynb ├── operations.py ├── other.py ├── roctx.py ├── testMarker │ ├── CMakeLists.txt │ ├── MatrixTranspose.cpp │ └── hip_helper.h └── test_multi_gpu.py ├── pybind ├── .gitignore ├── CMakeLists.txt ├── include │ ├── comm.h │ ├── config.h │ ├── cutlassGemmBase.cuh │ ├── cutlassGemmWrapper.cuh │ ├── cutlassH100Wrapper.cuh │ ├── gemmFactory.cuh │ ├── helper.h │ ├── netWrapper.cuh │ ├── networkManager.cuh │ ├── operatorWrapper.cuh │ ├── rms_norm.cuh │ ├── small_cuda_operator.cuh │ └── vortexData.cuh └── src │ ├── bind_all_reduce.cu │ ├── bind_gemm.cu │ ├── bind_genEmbedding.cu │ ├── bind_init_net.cu │ ├── bind_rms_norm.cu │ ├── bind_ropeappend.cu │ ├── bind_sample.cu │ ├── bind_silu_multiply.cu │ ├── comm.cu │ ├── fast_uring.cpp │ ├── generate_gemm │ ├── .gitignore │ ├── Makefile │ ├── gemmFactory.in │ └── genGEMM.py │ └── test.cu ├── pybind_amd ├── bind_gemm │ ├── CMakeLists.txt │ ├── bind_ck.cpp │ ├── common.hpp │ ├── gemm_lib.cpp │ ├── gemm_lib.hpp │ ├── gemm_lib_universal.cpp │ └── gemm_lib_universal.hpp └── bind_marker │ ├── CMakeLists.txt │ └── bind_marker.cpp ├── setup.sh ├── tests ├── benchmark_llama_model.py ├── benchmark_vllm.py ├── test_compute_comm_overlap.py ├── test_custom_all_reduce.py ├── test_gemm.py ├── test_nccl_attn_overlap.py ├── test_nccl_wrapper.py ├── test_overlap.py └── test_overlap_script.py └── utils ├── affinity_utils.py ├── cu_mask.py ├── frontend.py ├── gen_req.py ├── graph_plot.py ├── green_ctx.py ├── input_test.py ├── offload.py ├── prof_marker.py ├── request_info.py └── util_functions.py /.gitattributes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/.gitattributes -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/.gitmodules -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/README.md -------------------------------------------------------------------------------- /auto_search/new_search.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/auto_search/new_search.py -------------------------------------------------------------------------------- /auto_search/profileAnalysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/auto_search/profileAnalysis.py -------------------------------------------------------------------------------- /auto_search/search_result_json/70B_search_result_reverse_v3.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/auto_search/search_result_json/70B_search_result_reverse_v3.json -------------------------------------------------------------------------------- /auto_search/search_result_json/8B_allreduce_naive_search_result.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/auto_search/search_result_json/8B_allreduce_naive_search_result.json -------------------------------------------------------------------------------- /auto_search/test_db.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/auto_search/test_db.py -------------------------------------------------------------------------------- /config_all/.gitignore: -------------------------------------------------------------------------------- 1 | !*.json -------------------------------------------------------------------------------- /config_all/llama2-70B/1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/1024.json -------------------------------------------------------------------------------- /config_all/llama2-70B/2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/2048.json -------------------------------------------------------------------------------- /config_all/llama2-70B/768.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/768.json -------------------------------------------------------------------------------- /config_all/llama2-70B/correct_40G/2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/correct_40G/2048.json -------------------------------------------------------------------------------- /config_all/llama2-70B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/correct_40G/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama2-70B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/correct_40G/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama2-70B/correct_40G/offload.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/correct_40G/offload.json -------------------------------------------------------------------------------- /config_all/llama2-70B/fewer_layers/1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/fewer_layers/1024.json -------------------------------------------------------------------------------- /config_all/llama2-70B/fewer_layers/2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/fewer_layers/2048.json -------------------------------------------------------------------------------- /config_all/llama2-70B/fewer_layers/768.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/fewer_layers/768.json -------------------------------------------------------------------------------- /config_all/llama2-70B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/fewer_layers/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama2-70B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/fewer_layers/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama2-70B/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama2-70B/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama2-70B/pllm-offload.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama2-70B/pllm-offload.json -------------------------------------------------------------------------------- /config_all/llama3-70B/1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/1024.json -------------------------------------------------------------------------------- /config_all/llama3-70B/2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/2048.json -------------------------------------------------------------------------------- /config_all/llama3-70B/768.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/768.json -------------------------------------------------------------------------------- /config_all/llama3-70B/correct_40G/2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/correct_40G/2048.json -------------------------------------------------------------------------------- /config_all/llama3-70B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/correct_40G/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama3-70B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/correct_40G/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama3-70B/fewer_layers/1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/fewer_layers/1024.json -------------------------------------------------------------------------------- /config_all/llama3-70B/fewer_layers/2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/fewer_layers/2048.json -------------------------------------------------------------------------------- /config_all/llama3-70B/fewer_layers/768.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/fewer_layers/768.json -------------------------------------------------------------------------------- /config_all/llama3-70B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/fewer_layers/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama3-70B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/fewer_layers/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama3-70B/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama3-70B/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-70B/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama3-8B/1024-1-layer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/1024-1-layer.json -------------------------------------------------------------------------------- /config_all/llama3-8B/1024-h100-2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/1024-h100-2.json -------------------------------------------------------------------------------- /config_all/llama3-8B/1024-h100.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/1024-h100.json -------------------------------------------------------------------------------- /config_all/llama3-8B/1024-small-batch.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/1024-small-batch.json -------------------------------------------------------------------------------- /config_all/llama3-8B/1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/1024.json -------------------------------------------------------------------------------- /config_all/llama3-8B/correct_40G/1024-reconfig.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/correct_40G/1024-reconfig.json -------------------------------------------------------------------------------- /config_all/llama3-8B/correct_40G/1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/correct_40G/1024.json -------------------------------------------------------------------------------- /config_all/llama3-8B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/correct_40G/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama3-8B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/correct_40G/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama3-8B/fewer_layers/1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/fewer_layers/1024.json -------------------------------------------------------------------------------- /config_all/llama3-8B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/fewer_layers/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama3-8B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/fewer_layers/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama3-8B/nanobatch-only-h100.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/nanobatch-only-h100.json -------------------------------------------------------------------------------- /config_all/llama3-8B/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama3-8B/non-overlap-h100.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/non-overlap-h100.json -------------------------------------------------------------------------------- /config_all/llama3-8B/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama3-8B/offload.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3-8B/offload.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/1024.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/2048.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/768.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/768.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/correct_40G/2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/correct_40G/2048.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/correct_40G/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/correct_40G/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/fewer_layers/1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/fewer_layers/1024.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/fewer_layers/2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/fewer_layers/2048.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/fewer_layers/768.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/fewer_layers/768.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/fewer_layers/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/fewer_layers/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama3.1-70B/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-70B/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama3.1-8B/1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-8B/1024.json -------------------------------------------------------------------------------- /config_all/llama3.1-8B/correct_40G/1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-8B/correct_40G/1024.json -------------------------------------------------------------------------------- /config_all/llama3.1-8B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-8B/correct_40G/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama3.1-8B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-8B/correct_40G/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama3.1-8B/fewer_layers/1024.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-8B/fewer_layers/1024.json -------------------------------------------------------------------------------- /config_all/llama3.1-8B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-8B/fewer_layers/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama3.1-8B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-8B/fewer_layers/non-overlap.json -------------------------------------------------------------------------------- /config_all/llama3.1-8B/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-8B/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/llama3.1-8B/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/llama3.1-8B/non-overlap.json -------------------------------------------------------------------------------- /config_all/mixtral-8-7B/6144.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/mixtral-8-7B/6144.json -------------------------------------------------------------------------------- /config_all/mixtral-8-7B/correct_40G/6144.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/mixtral-8-7B/correct_40G/6144.json -------------------------------------------------------------------------------- /config_all/mixtral-8-7B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/mixtral-8-7B/correct_40G/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/mixtral-8-7B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/mixtral-8-7B/correct_40G/non-overlap.json -------------------------------------------------------------------------------- /config_all/mixtral-8-7B/fewer_layers/6144.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/mixtral-8-7B/fewer_layers/6144.json -------------------------------------------------------------------------------- /config_all/mixtral-8-7B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/mixtral-8-7B/fewer_layers/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/mixtral-8-7B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/mixtral-8-7B/fewer_layers/non-overlap.json -------------------------------------------------------------------------------- /config_all/mixtral-8-7B/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/mixtral-8-7B/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/mixtral-8-7B/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/mixtral-8-7B/non-overlap.json -------------------------------------------------------------------------------- /config_all/qwen2-72B/2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/qwen2-72B/2048.json -------------------------------------------------------------------------------- /config_all/qwen2-72B/correct_40G/2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/qwen2-72B/correct_40G/2048.json -------------------------------------------------------------------------------- /config_all/qwen2-72B/correct_40G/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/qwen2-72B/correct_40G/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/qwen2-72B/correct_40G/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/qwen2-72B/correct_40G/non-overlap.json -------------------------------------------------------------------------------- /config_all/qwen2-72B/fewer_layers/2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/qwen2-72B/fewer_layers/2048.json -------------------------------------------------------------------------------- /config_all/qwen2-72B/fewer_layers/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/qwen2-72B/fewer_layers/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/qwen2-72B/fewer_layers/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/qwen2-72B/fewer_layers/non-overlap.json -------------------------------------------------------------------------------- /config_all/qwen2-72B/nanobatch-only.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/qwen2-72B/nanobatch-only.json -------------------------------------------------------------------------------- /config_all/qwen2-72B/non-overlap.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/config_all/qwen2-72B/non-overlap.json -------------------------------------------------------------------------------- /core/IOWrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/core/IOWrapper.py -------------------------------------------------------------------------------- /core/bufferAllocate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/core/bufferAllocate.py -------------------------------------------------------------------------------- /core/categoryType.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/core/categoryType.py -------------------------------------------------------------------------------- /core/executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/core/executor.py -------------------------------------------------------------------------------- /core/nanobatchSplit.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/core/nanobatchSplit.py -------------------------------------------------------------------------------- /core/processWeight.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/core/processWeight.py -------------------------------------------------------------------------------- /core/weightManager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/core/weightManager.py -------------------------------------------------------------------------------- /core/weightSaver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/core/weightSaver.py -------------------------------------------------------------------------------- /core/weightWrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/core/weightWrapper.py -------------------------------------------------------------------------------- /core/worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/core/worker.py -------------------------------------------------------------------------------- /entry/.gitignore: -------------------------------------------------------------------------------- 1 | test_data -------------------------------------------------------------------------------- /entry/compare.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/entry/compare.py -------------------------------------------------------------------------------- /entry/compare_kv_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/entry/compare_kv_cache.py -------------------------------------------------------------------------------- /entry/easy_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/entry/easy_test.py -------------------------------------------------------------------------------- /entry/run_llama3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/entry/run_llama3.py -------------------------------------------------------------------------------- /entry/run_llama3.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/entry/run_llama3.sh -------------------------------------------------------------------------------- /entry/test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/entry/test.py -------------------------------------------------------------------------------- /entry/test_multi_gpu3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/entry/test_multi_gpu3.py -------------------------------------------------------------------------------- /entry/test_multi_gpu_tp2.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/entry/test_multi_gpu_tp2.sh -------------------------------------------------------------------------------- /entry/test_multi_gpu_tp4.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/entry/test_multi_gpu_tp4.sh -------------------------------------------------------------------------------- /figures/NanoflowLogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/figures/NanoflowLogo.png -------------------------------------------------------------------------------- /figures/OfflineThroughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/figures/OfflineThroughput.png -------------------------------------------------------------------------------- /figures/SampleOutput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/figures/SampleOutput.png -------------------------------------------------------------------------------- /figures/SystemDesign.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/figures/SystemDesign.png -------------------------------------------------------------------------------- /figures/async-schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/figures/async-schedule.png -------------------------------------------------------------------------------- /figures/feasibility.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/figures/feasibility.png -------------------------------------------------------------------------------- /figures/nanoflow-osdi-simplify.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/figures/nanoflow-osdi-simplify.pdf -------------------------------------------------------------------------------- /figures/online-latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/figures/online-latency.png -------------------------------------------------------------------------------- /figures/pipeline.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/figures/pipeline.gif -------------------------------------------------------------------------------- /figures/serve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/figures/serve.png -------------------------------------------------------------------------------- /installAnaconda.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/installAnaconda.sh -------------------------------------------------------------------------------- /kvcache/kv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/kvcache/kv.py -------------------------------------------------------------------------------- /kvcache/triton/kv_copy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/kvcache/triton/kv_copy.py -------------------------------------------------------------------------------- /matplot/draw_profile_dec.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/matplot/draw_profile_dec.py -------------------------------------------------------------------------------- /matplot/draw_profile_gemm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/matplot/draw_profile_gemm.py -------------------------------------------------------------------------------- /matplot/draw_single_gpu_performance_compare.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/matplot/draw_single_gpu_performance_compare.py -------------------------------------------------------------------------------- /models/llama3-70B.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3-70B.py -------------------------------------------------------------------------------- /models/llama3_70B_FlashinferKVCache_allgather.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_70B_FlashinferKVCache_allgather.py -------------------------------------------------------------------------------- /models/llama3_70B_FlashinferKVCache_allreduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_70B_FlashinferKVCache_allreduce.py -------------------------------------------------------------------------------- /models/llama3_70B_KVCacheFA_TP8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_70B_KVCacheFA_TP8.py -------------------------------------------------------------------------------- /models/llama3_70B_KVCacheTorch_allgather.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_70B_KVCacheTorch_allgather.py -------------------------------------------------------------------------------- /models/llama3_70B_KVCacheTorch_allreduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_70B_KVCacheTorch_allreduce.py -------------------------------------------------------------------------------- /models/llama3_70B_allreduce_AutoSearch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_70B_allreduce_AutoSearch.py -------------------------------------------------------------------------------- /models/llama3_8B_FlashinferKVCache_allreduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_8B_FlashinferKVCache_allreduce.py -------------------------------------------------------------------------------- /models/llama3_8B_KVCacheFA_allgather.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_8B_KVCacheFA_allgather.py -------------------------------------------------------------------------------- /models/llama3_8B_KVCacheFA_allreduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_8B_KVCacheFA_allreduce.py -------------------------------------------------------------------------------- /models/llama3_8B_allreduce_AutoSearch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_8B_allreduce_AutoSearch.py -------------------------------------------------------------------------------- /models/llama3_AutoSearch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_AutoSearch.py -------------------------------------------------------------------------------- /models/llama3_FlashinferKVCache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_FlashinferKVCache.py -------------------------------------------------------------------------------- /models/llama3_KVCacheFA.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_KVCacheFA.py -------------------------------------------------------------------------------- /models/llama3_KVCacheTorch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/models/llama3_KVCacheTorch.py -------------------------------------------------------------------------------- /operations/activation/silu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/activation/silu.py -------------------------------------------------------------------------------- /operations/allgather/allgather.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/allgather/allgather.py -------------------------------------------------------------------------------- /operations/allreduce/allreduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/allreduce/allreduce.py -------------------------------------------------------------------------------- /operations/attention/llamaAttention_flashattn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/attention/llamaAttention_flashattn.py -------------------------------------------------------------------------------- /operations/attention/llamaAttention_flashinfer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/attention/llamaAttention_flashinfer.py -------------------------------------------------------------------------------- /operations/attention/llamaAttention_torch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/attention/llamaAttention_torch.py -------------------------------------------------------------------------------- /operations/attention/llamaAttention_vllm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/attention/llamaAttention_vllm.py -------------------------------------------------------------------------------- /operations/embedding/embedding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/embedding/embedding.py -------------------------------------------------------------------------------- /operations/gemm/gemm_K_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/gemm/gemm_K_parallel.py -------------------------------------------------------------------------------- /operations/gemm/gemm_N_parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/gemm/gemm_N_parallel.py -------------------------------------------------------------------------------- /operations/gemm/gemm_impls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/gemm/gemm_impls.py -------------------------------------------------------------------------------- /operations/globalOp/globalOp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/globalOp/globalOp.py -------------------------------------------------------------------------------- /operations/impl_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/impl_base.py -------------------------------------------------------------------------------- /operations/norm/rmsnorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/norm/rmsnorm.py -------------------------------------------------------------------------------- /operations/norm/triton/kernels/rmsnorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/norm/triton/kernels/rmsnorm.py -------------------------------------------------------------------------------- /operations/operation_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/operation_base.py -------------------------------------------------------------------------------- /operations/recv/recv.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /operations/rope/help_functions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/rope/help_functions.py -------------------------------------------------------------------------------- /operations/rope/rope_fa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/rope/rope_fa.py -------------------------------------------------------------------------------- /operations/rope/rope_flashinfer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/rope/rope_flashinfer.py -------------------------------------------------------------------------------- /operations/rope/rope_torch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/rope/rope_torch.py -------------------------------------------------------------------------------- /operations/rope/triton/kernels/rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/rope/triton/kernels/rope.py -------------------------------------------------------------------------------- /operations/sampling/max_sampling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/sampling/max_sampling.py -------------------------------------------------------------------------------- /operations/send/send.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/send/send.py -------------------------------------------------------------------------------- /operations/virtualOp/virtual_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/operations/virtualOp/virtual_ops.py -------------------------------------------------------------------------------- /platform_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/platform_config.py -------------------------------------------------------------------------------- /playground/compare.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/playground/compare.py -------------------------------------------------------------------------------- /playground/gt_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/playground/gt_llama.py -------------------------------------------------------------------------------- /playground/operations.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/playground/operations.ipynb -------------------------------------------------------------------------------- /playground/operations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/playground/operations.py -------------------------------------------------------------------------------- /playground/other.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/playground/other.py -------------------------------------------------------------------------------- /playground/roctx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/playground/roctx.py -------------------------------------------------------------------------------- /playground/testMarker/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/playground/testMarker/CMakeLists.txt -------------------------------------------------------------------------------- /playground/testMarker/MatrixTranspose.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/playground/testMarker/MatrixTranspose.cpp -------------------------------------------------------------------------------- /playground/testMarker/hip_helper.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/playground/testMarker/hip_helper.h -------------------------------------------------------------------------------- /playground/test_multi_gpu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/playground/test_multi_gpu.py -------------------------------------------------------------------------------- /pybind/.gitignore: -------------------------------------------------------------------------------- 1 | ./build/* -------------------------------------------------------------------------------- /pybind/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/CMakeLists.txt -------------------------------------------------------------------------------- /pybind/include/comm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/comm.h -------------------------------------------------------------------------------- /pybind/include/config.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/config.h -------------------------------------------------------------------------------- /pybind/include/cutlassGemmBase.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/cutlassGemmBase.cuh -------------------------------------------------------------------------------- /pybind/include/cutlassGemmWrapper.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/cutlassGemmWrapper.cuh -------------------------------------------------------------------------------- /pybind/include/cutlassH100Wrapper.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/cutlassH100Wrapper.cuh -------------------------------------------------------------------------------- /pybind/include/gemmFactory.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/gemmFactory.cuh -------------------------------------------------------------------------------- /pybind/include/helper.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/helper.h -------------------------------------------------------------------------------- /pybind/include/netWrapper.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/netWrapper.cuh -------------------------------------------------------------------------------- /pybind/include/networkManager.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/networkManager.cuh -------------------------------------------------------------------------------- /pybind/include/operatorWrapper.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/operatorWrapper.cuh -------------------------------------------------------------------------------- /pybind/include/rms_norm.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/rms_norm.cuh -------------------------------------------------------------------------------- /pybind/include/small_cuda_operator.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/small_cuda_operator.cuh -------------------------------------------------------------------------------- /pybind/include/vortexData.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/include/vortexData.cuh -------------------------------------------------------------------------------- /pybind/src/bind_all_reduce.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/bind_all_reduce.cu -------------------------------------------------------------------------------- /pybind/src/bind_gemm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/bind_gemm.cu -------------------------------------------------------------------------------- /pybind/src/bind_genEmbedding.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/bind_genEmbedding.cu -------------------------------------------------------------------------------- /pybind/src/bind_init_net.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/bind_init_net.cu -------------------------------------------------------------------------------- /pybind/src/bind_rms_norm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/bind_rms_norm.cu -------------------------------------------------------------------------------- /pybind/src/bind_ropeappend.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/bind_ropeappend.cu -------------------------------------------------------------------------------- /pybind/src/bind_sample.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/bind_sample.cu -------------------------------------------------------------------------------- /pybind/src/bind_silu_multiply.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/bind_silu_multiply.cu -------------------------------------------------------------------------------- /pybind/src/comm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/comm.cu -------------------------------------------------------------------------------- /pybind/src/fast_uring.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/fast_uring.cpp -------------------------------------------------------------------------------- /pybind/src/generate_gemm/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/generate_gemm/.gitignore -------------------------------------------------------------------------------- /pybind/src/generate_gemm/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/generate_gemm/Makefile -------------------------------------------------------------------------------- /pybind/src/generate_gemm/gemmFactory.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/generate_gemm/gemmFactory.in -------------------------------------------------------------------------------- /pybind/src/generate_gemm/genGEMM.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/generate_gemm/genGEMM.py -------------------------------------------------------------------------------- /pybind/src/test.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind/src/test.cu -------------------------------------------------------------------------------- /pybind_amd/bind_gemm/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind_amd/bind_gemm/CMakeLists.txt -------------------------------------------------------------------------------- /pybind_amd/bind_gemm/bind_ck.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind_amd/bind_gemm/bind_ck.cpp -------------------------------------------------------------------------------- /pybind_amd/bind_gemm/common.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind_amd/bind_gemm/common.hpp -------------------------------------------------------------------------------- /pybind_amd/bind_gemm/gemm_lib.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind_amd/bind_gemm/gemm_lib.cpp -------------------------------------------------------------------------------- /pybind_amd/bind_gemm/gemm_lib.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind_amd/bind_gemm/gemm_lib.hpp -------------------------------------------------------------------------------- /pybind_amd/bind_gemm/gemm_lib_universal.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind_amd/bind_gemm/gemm_lib_universal.cpp -------------------------------------------------------------------------------- /pybind_amd/bind_gemm/gemm_lib_universal.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind_amd/bind_gemm/gemm_lib_universal.hpp -------------------------------------------------------------------------------- /pybind_amd/bind_marker/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind_amd/bind_marker/CMakeLists.txt -------------------------------------------------------------------------------- /pybind_amd/bind_marker/bind_marker.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/pybind_amd/bind_marker/bind_marker.cpp -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/setup.sh -------------------------------------------------------------------------------- /tests/benchmark_llama_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/tests/benchmark_llama_model.py -------------------------------------------------------------------------------- /tests/benchmark_vllm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/tests/benchmark_vllm.py -------------------------------------------------------------------------------- /tests/test_compute_comm_overlap.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/tests/test_compute_comm_overlap.py -------------------------------------------------------------------------------- /tests/test_custom_all_reduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/tests/test_custom_all_reduce.py -------------------------------------------------------------------------------- /tests/test_gemm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/tests/test_gemm.py -------------------------------------------------------------------------------- /tests/test_nccl_attn_overlap.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/tests/test_nccl_attn_overlap.py -------------------------------------------------------------------------------- /tests/test_nccl_wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/tests/test_nccl_wrapper.py -------------------------------------------------------------------------------- /tests/test_overlap.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/tests/test_overlap.py -------------------------------------------------------------------------------- /tests/test_overlap_script.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/tests/test_overlap_script.py -------------------------------------------------------------------------------- /utils/affinity_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/utils/affinity_utils.py -------------------------------------------------------------------------------- /utils/cu_mask.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/utils/cu_mask.py -------------------------------------------------------------------------------- /utils/frontend.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/utils/frontend.py -------------------------------------------------------------------------------- /utils/gen_req.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/utils/gen_req.py -------------------------------------------------------------------------------- /utils/graph_plot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/utils/graph_plot.py -------------------------------------------------------------------------------- /utils/green_ctx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/utils/green_ctx.py -------------------------------------------------------------------------------- /utils/input_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/utils/input_test.py -------------------------------------------------------------------------------- /utils/offload.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/utils/offload.py -------------------------------------------------------------------------------- /utils/prof_marker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/utils/prof_marker.py -------------------------------------------------------------------------------- /utils/request_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/utils/request_info.py -------------------------------------------------------------------------------- /utils/util_functions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/efeslab/Nanoflow/HEAD/utils/util_functions.py --------------------------------------------------------------------------------