├── .clang-format ├── .dockerignore ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── runs-on.yml └── workflows │ ├── build.yml │ ├── clang-format-check.yml │ ├── docker-build.yml │ ├── gpu-ci.yml │ ├── helpers │ ├── free_space_on_runner.sh │ ├── gpu_ci_helper.py │ ├── install_cudnn.sh │ ├── install_dependencies.sh │ ├── install_nccl.sh │ └── oracle_con.py │ ├── pip-deploy.yml │ ├── pip-install.yml │ └── shell-check.yml ├── .gitignore ├── .gitmodules ├── .readthedocs.yaml ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── INSTALL.md ├── LICENSE ├── MANIFEST.in ├── MULTI-NODE.md ├── README.md ├── SERVE.md ├── cmake ├── cuda.cmake ├── cudnn.cmake ├── flash_attn.cmake ├── hip.cmake ├── json.cmake ├── legion.cmake ├── nccl.cmake ├── optional.cmake ├── pip_install │ └── CMakeLists.txt ├── utils.cmake └── variant.cmake ├── conda └── flexflow.yml ├── config ├── config.inc └── config.linux ├── docker ├── README.md ├── build.sh ├── flexflow-environment │ ├── Dockerfile │ └── install_pytorch.sh ├── flexflow │ └── Dockerfile ├── publish.sh ├── pull.sh └── run.sh ├── docs ├── Makefile ├── doxygen │ ├── Doxyfile │ ├── README.md │ └── theme │ │ ├── rust_customdoxygen.css │ │ ├── rust_footer.html │ │ └── rust_header.html ├── make.bat ├── requirements.txt └── source │ ├── chatbot.rst │ ├── conf.py │ ├── cpp_api.rst │ ├── developers_guide │ ├── developers_guide.rst │ ├── ff_internals.rst │ └── internals.md │ ├── docker.rst │ ├── imgs │ ├── gradio_api.png │ └── gradio_interface.png │ ├── index.rst │ ├── installation.rst │ ├── keras.rst │ ├── mt5.rst │ ├── multinode.rst │ ├── onnx.rst │ ├── prompt_template.rst │ ├── python │ ├── create.rst │ ├── dataloader.rst │ ├── init.rst │ ├── layers.rst │ ├── models.rst │ └── train.rst │ ├── pytorch.rst │ ├── rag.rst │ ├── serve_api.rst │ ├── serve_fastapi.rst │ ├── serve_gradioapi.rst │ ├── serve_overview.rst │ ├── serve_usecases.rst │ ├── train_examples.rst │ ├── train_interface.rst │ ├── train_overview.rst │ ├── train_python_api.rst │ └── welcome.rst ├── img ├── overview.png ├── performance.png └── spec_infer_demo.gif ├── include └── flexflow │ ├── accessor.h │ ├── attention_config.h │ ├── basic_graph.h │ ├── batch_config.h │ ├── config.h │ ├── dataloader.h │ ├── device.h │ ├── dominators.h │ ├── ffconst.h │ ├── ffconst_utils.h │ ├── fftype.h │ ├── flash_api.h │ ├── flexflow_c.h │ ├── gpt_tokenizer.h │ ├── graph.h │ ├── graph_structures.h │ ├── inference.h │ ├── initializer.h │ ├── layer.h │ ├── loss_functions.h │ ├── machine_view.h │ ├── mapper.h │ ├── memory_optimization.h │ ├── metrics_functions.h │ ├── model.h │ ├── node.h │ ├── op_meta.h │ ├── operator.h │ ├── operator_params.h │ ├── ops │ ├── add_bias_residual_layer_norm.h │ ├── add_bias_residual_layer_norm_params.h │ ├── aggregate.h │ ├── aggregate_params.h │ ├── aggregate_spec.h │ ├── aggregate_spec_params.h │ ├── arg_topk.h │ ├── arg_topk_params.h │ ├── argmax.h │ ├── argmax_params.h │ ├── attention.h │ ├── attention_params.h │ ├── batch_matmul.h │ ├── batch_matmul_params.h │ ├── batch_norm.h │ ├── beam_topk.h │ ├── beam_topk_params.h │ ├── cache.h │ ├── cast.h │ ├── cast_params.h │ ├── concat.h │ ├── concat_params.h │ ├── conv_2d.h │ ├── conv_2d_params.h │ ├── dropout.h │ ├── dropout_params.h │ ├── element_binary.h │ ├── element_binary_params.h │ ├── element_unary.h │ ├── element_unary_params.h │ ├── embedding.h │ ├── embedding_params.h │ ├── experts.h │ ├── experts_params.h │ ├── flat.h │ ├── flat_params.h │ ├── fused.h │ ├── gather.h │ ├── gather_params.h │ ├── groupby.h │ ├── groupby_params.h │ 
├── inc_multihead_self_attention.h │ ├── inc_multihead_self_attention_params.h │ ├── kernels │ │ ├── batch_matmul_kernels.h │ │ ├── cast_kernels.h │ │ ├── concat_kernels.h │ │ ├── conv_2d_kernels.h │ │ ├── decompress_kernels.h │ │ ├── dropout_kernels.h │ │ ├── element_binary_kernels.h │ │ ├── embedding_kernels.h │ │ ├── flat_kernels.h │ │ ├── gather_kernels.h │ │ ├── inc_multihead_self_attention_kernels.h │ │ ├── inc_multihead_self_attention_utils.cuh │ │ ├── linear_kernels.h │ │ ├── lora_linear_kernels.h │ │ ├── pool_2d_kernels.h │ │ ├── reshape_kernels.h │ │ ├── residual_rms_norm_kernels.h │ │ ├── rms_norm_kernels.h │ │ ├── softmax_kernels.h │ │ ├── split_kernels.h │ │ └── transpose_kernels.h │ ├── layer_norm.h │ ├── layer_norm_params.h │ ├── linear.h │ ├── linear_params.h │ ├── lora_linear.h │ ├── lora_linear_params.h │ ├── mean.h │ ├── noop.h │ ├── pool_2d.h │ ├── pool_2d_params.h │ ├── reduce.h │ ├── reduce_params.h │ ├── reshape.h │ ├── reshape_params.h │ ├── residual_layer_norm.h │ ├── residual_layer_norm_params.h │ ├── residual_rms_norm.h │ ├── residual_rms_norm_params.h │ ├── reverse.h │ ├── rms_norm.h │ ├── rms_norm_params.h │ ├── sampling.h │ ├── sampling_params.h │ ├── sigmoid_silu_multi.h │ ├── sigmoid_silu_multi_params.h │ ├── softmax.h │ ├── softmax_params.h │ ├── spec_inc_multihead_self_attention.h │ ├── spec_inc_multihead_self_attention_params.h │ ├── split.h │ ├── split_params.h │ ├── topk.h │ ├── topk_params.h │ ├── transpose.h │ ├── transpose_params.h │ ├── tree_inc_multihead_self_attention.h │ └── tree_inc_multihead_self_attention_params.h │ ├── optimizer.h │ ├── page_manager.h │ ├── parallel_ops │ ├── allreduce.h │ ├── allreduce_params.h │ ├── combine.h │ ├── combine_params.h │ ├── fused_parallel_op.h │ ├── fused_parallel_op_params.h │ ├── kernels │ │ ├── allreduce_kernels.h │ │ ├── combine_kernels.h │ │ ├── parallel_identity_kernels.h │ │ ├── partition_kernels.h │ │ ├── reduction_kernels.h │ │ └── replicate_kernels.h │ ├── parallel_identity.h │ ├── parallel_identity_params.h │ ├── parallel_op.h │ ├── parallel_op_info.h │ ├── partition.h │ ├── partition_params.h │ ├── reduction.h │ ├── reduction_params.h │ ├── replicate.h │ └── replicate_params.h │ ├── parallel_tensor.h │ ├── recompile.h │ ├── request_manager.h │ ├── runtime.h │ ├── simulator.h │ ├── substitution.h │ ├── substitution_loader.h │ ├── tensor.h │ └── utils │ ├── cuda_helper.h │ ├── disjoint_set.h │ ├── dot │ ├── dot_file.h │ └── record_formatter.h │ ├── file_loader.h │ ├── hash_utils.h │ ├── hip_helper.h │ ├── memory_allocator.h │ ├── peft_weight_allocator.h │ ├── random_utils.h │ ├── recursive_logger.h │ ├── test_utils.h │ └── tuple.h ├── inference ├── .gitignore ├── README.md ├── flexllm │ ├── CMakeLists.txt │ └── peft_train.cc ├── incr_decoding │ ├── CMakeLists.txt │ └── incr_decoding.cc ├── inference_wrapper.in ├── models │ ├── falcon.cc │ ├── falcon.h │ ├── llama.cc │ ├── llama.h │ ├── mpt.cc │ ├── mpt.h │ ├── opt.cc │ ├── opt.h │ ├── starcoder.cc │ └── starcoder.h ├── peft │ ├── CMakeLists.txt │ └── peft.cc ├── python │ ├── chat.py │ ├── entrypoint │ │ ├── fastapi_incr.py │ │ └── fastapi_specinfer.py │ ├── ff_peft.py │ ├── incr_decoding.py │ ├── peft_demo │ │ ├── INSTRUCTIONS.md │ │ ├── demo.ipynb │ │ └── demo.py │ ├── save_dataset.py │ ├── spec_infer.py │ ├── streamlit │ │ ├── README.md │ │ ├── app.py │ │ └── fastapi_incr.py │ └── usecases │ │ ├── gradio_incr.py │ │ ├── gradio_specinfer.py │ │ ├── prompt_template_incr.py │ │ ├── prompt_template_specinfer.py │ │ ├── rag_incr.py │ │ └── 
rag_specinfer.py ├── spec_infer │ ├── CMakeLists.txt │ └── spec_infer.cc └── utils │ ├── compress_llama_weights.py │ ├── download_hf_model.py │ ├── download_peft_model.py │ ├── mem_analysis.py │ └── upload_peft_model.py ├── pyproject.toml ├── python ├── flexflow │ ├── __init__.py │ ├── config.py │ ├── core │ │ ├── __init__.py │ │ ├── flexflow_cffi.py │ │ ├── flexflow_logger.py │ │ ├── flexflow_top.py │ │ └── flexflowlib.py │ ├── findpylib.py │ ├── serve │ │ ├── __init__.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── falcon.py │ │ │ ├── llama.py │ │ │ ├── mpt.py │ │ │ ├── opt.py │ │ │ └── starcoder.py │ │ └── serve.py │ ├── torch │ │ ├── __init__.py │ │ ├── model.py │ │ └── nn │ │ │ ├── __init__.py │ │ │ └── modules │ │ │ ├── __init__.py │ │ │ └── module.py │ └── type.py ├── flexflow_cffi_build.py ├── flexflow_cffi_header.py.in └── flexflow_python_build.py ├── requirements.txt ├── scripts ├── format.sh ├── gdb │ └── pretty_print.py ├── install_tokenizer.sh ├── mnist_mlp_run.sh └── rdelacou │ └── generate_trace.py ├── setup.py ├── src ├── c │ └── flexflow_c.cc ├── dataloader │ ├── dataloader.cc │ ├── dataloader.cpp │ └── dataloader.cu ├── loss_functions │ ├── loss_functions.cc │ ├── loss_functions.cpp │ └── loss_functions.cu ├── mapper │ └── mapper.cc ├── metrics_functions │ ├── metrics_functions.cc │ ├── metrics_functions.cpp │ └── metrics_functions.cu ├── ops │ ├── add_bias_residual_layer_norm.cc │ ├── add_bias_residual_layer_norm.cpp │ ├── add_bias_residual_layer_norm.cu │ ├── aggregate.cc │ ├── aggregate.cpp │ ├── aggregate.cu │ ├── aggregate_spec.cc │ ├── aggregate_spec.cpp │ ├── aggregate_spec.cu │ ├── arg_topk.cc │ ├── arg_topk.cpp │ ├── arg_topk.cu │ ├── argmax.cc │ ├── argmax.cpp │ ├── argmax.cu │ ├── attention.cc │ ├── attention.cpp │ ├── attention.cu │ ├── attention_impl.cu │ ├── batch_matmul.cc │ ├── batch_norm.cc │ ├── batch_norm.cpp │ ├── batch_norm.cu │ ├── beam_topk.cc │ ├── beam_topk.cpp │ ├── beam_topk.cu │ ├── cache.cc │ ├── cache.cpp │ ├── cache.cu │ ├── cast.cc │ ├── concat.cc │ ├── conv_2d.cc │ ├── dropout.cc │ ├── element_binary.cc │ ├── element_unary.cc │ ├── element_unary.cpp │ ├── element_unary.cu │ ├── embedding.cc │ ├── experts.cc │ ├── experts.cpp │ ├── experts.cu │ ├── flat.cc │ ├── fused.cc │ ├── fused.cpp │ ├── fused.cu │ ├── gather.cc │ ├── group_by.cc │ ├── group_by.cpp │ ├── group_by.cu │ ├── inc_multihead_self_attention.cc │ ├── inc_multihead_self_attention.cpp │ ├── inc_multihead_self_attention.cu │ ├── kernels │ │ ├── batch_matmul.cpp │ │ ├── batch_matmul.cu │ │ ├── cast_kernels.cpp │ │ ├── cast_kernels.cu │ │ ├── concat_kernels.cpp │ │ ├── concat_kernels.cu │ │ ├── conv_2d_kernels.cpp │ │ ├── conv_2d_kernels.cu │ │ ├── decompress_kernels.cpp │ │ ├── decompress_kernels.cu │ │ ├── dropout_kernels.cpp │ │ ├── dropout_kernels.cu │ │ ├── element_binary_kernels.cpp │ │ ├── element_binary_kernels.cu │ │ ├── embedding_kernels.cpp │ │ ├── embedding_kernels.cu │ │ ├── flat_kernels.cpp │ │ ├── flat_kernels.cu │ │ ├── gather_kernels.cpp │ │ ├── gather_kernels.cu │ │ ├── linear_kernels.cpp │ │ ├── linear_kernels.cu │ │ ├── lora_linear_kernels.cpp │ │ ├── lora_linear_kernels.cu │ │ ├── pool_2d_kernels.cpp │ │ ├── pool_2d_kernels.cu │ │ ├── reshape_kernels.cpp │ │ ├── reshape_kernels.cu │ │ ├── residual_rms_norm_kernels.cpp │ │ ├── residual_rms_norm_kernels.cu │ │ ├── rms_norm_kernels.cpp │ │ ├── rms_norm_kernels.cu │ │ ├── softmax.cpp │ │ ├── softmax.cu │ │ ├── split_kernels.cpp │ │ ├── split_kernels.cu │ │ ├── transpose_kernels.cpp │ │ └── 
transpose_kernels.cu │ ├── layer_norm.cc │ ├── layer_norm.cpp │ ├── layer_norm.cu │ ├── linear.cc │ ├── lora_linear.cc │ ├── lora_linear_params.cc │ ├── mean.cc │ ├── mean.cpp │ ├── mean.cu │ ├── moe.cc │ ├── noop.cc │ ├── pool_2d.cc │ ├── reduce.cc │ ├── reduce.cpp │ ├── reduce.cu │ ├── reshape.cc │ ├── residual_layer_norm.cc │ ├── residual_layer_norm.cpp │ ├── residual_layer_norm.cu │ ├── residual_rms_norm.cc │ ├── reverse.cc │ ├── reverse.cpp │ ├── reverse.cu │ ├── rms_norm.cc │ ├── sampling.cc │ ├── sampling.cpp │ ├── sampling.cu │ ├── sigmoid_silu_multi.cc │ ├── sigmoid_silu_multi.cpp │ ├── sigmoid_silu_multi.cu │ ├── softmax.cc │ ├── spec_inc_multihead_self_attention.cc │ ├── spec_inc_multihead_self_attention.cpp │ ├── spec_inc_multihead_self_attention.cu │ ├── split.cc │ ├── topk.cc │ ├── topk.cpp │ ├── topk.cu │ ├── transpose.cc │ ├── tree_inc_multihead_self_attention.cc │ ├── tree_inc_multihead_self_attention.cpp │ └── tree_inc_multihead_self_attention.cu ├── parallel_ops │ ├── allreduce.cc │ ├── combine.cc │ ├── fused_parallel_op.cc │ ├── fused_parallel_op.cpp │ ├── fused_parallel_op.cu │ ├── kernels │ │ ├── allreduce_kernels.cpp │ │ ├── allreduce_kernels.cu │ │ ├── combine_kernels.cpp │ │ ├── combine_kernels.cu │ │ ├── parallel_identity_kernels.cpp │ │ ├── parallel_identity_kernels.cu │ │ ├── partition_kernels.cpp │ │ ├── partition_kernels.cu │ │ ├── reduction_kernels.cpp │ │ ├── reduction_kernels.cu │ │ ├── replicate_kernels.cpp │ │ └── replicate_kernels.cu │ ├── parallel_identity.cc │ ├── partition.cc │ ├── reduction.cc │ └── replicate.cc ├── recompile │ └── recompile_state.cc ├── runtime │ ├── accessor.cc │ ├── accessor_kernel.cpp │ ├── accessor_kernel.cu │ ├── batch_config.cc │ ├── beam_search_batch_config.cc │ ├── compile.sh │ ├── cpp_driver.cc │ ├── cuda_helper.cu │ ├── ffconst_utils.cc │ ├── fftype.cc │ ├── file_loader.cc │ ├── gpt_tokenizer.cc │ ├── graph.cc │ ├── hip_helper.cpp │ ├── inference_manager.cc │ ├── initializer.cc │ ├── initializer_kernel.cpp │ ├── initializer_kernel.cu │ ├── layer.cc │ ├── machine_model.cc │ ├── machine_view.cc │ ├── memory_allocator.cc │ ├── memory_optimization.cc │ ├── model.cc │ ├── model.cpp │ ├── model.cu │ ├── network.cc │ ├── operator.cc │ ├── operator_params.cc │ ├── optimizer.cc │ ├── optimizer_kernel.cpp │ ├── optimizer_kernel.cu │ ├── page_manager.cc │ ├── parallel_op.cc │ ├── parallel_tensor.cc │ ├── peft_weight_allocator.cc │ ├── peft_weight_allocator.cpp │ ├── peft_weight_allocator.cu │ ├── recursive_logger.cc │ ├── request_manager.cc │ ├── request_manager.cpp │ ├── request_manager.cu │ ├── simulator.cc │ ├── simulator.cpp │ ├── simulator.cu │ ├── strategy.cc │ ├── substitution.cc │ ├── substitution_loader.cc │ ├── tensor.cpp │ ├── tensor.cu │ └── tree_verify_batch_config.cc └── utils │ └── dot │ └── record_formatter.cc ├── tests ├── align │ ├── README.md │ ├── align_create_tensor_ff.py │ ├── align_create_tensor_torch.py │ ├── align_ff_utils.py │ ├── align_test.py │ ├── align_utils.py │ ├── mt5_encoder │ │ └── align_mt5_encoder_ff.py │ ├── mt5_ff_utils.py │ ├── peft_flash_attn │ │ ├── align.py │ │ ├── align_2_tensors_from_pt.py │ │ ├── launch.json │ │ ├── peft_flash_debug_note │ │ └── test_bwd_2x_grad_buf.py │ └── test_all_operators.sh ├── fine_grained_alignment_test.sh ├── inference │ ├── cpp_inference_tests.sh │ ├── generate_inf_test_configs.py │ ├── huggingface_inference.py │ ├── huggingface_inference_simple.py │ ├── huggingface_pipeline.py │ ├── inference_alignment_test.py │ └── test_inference_output.py ├── 
inference_tests.sh ├── multinode_helpers │ ├── mpi_wrapper1.sh │ └── mpi_wrapper2.sh ├── peft │ ├── alignment │ │ ├── align_test_utils.py │ │ ├── llama_alignment_tests.ipynb │ │ └── opt_alignment_tests.ipynb │ ├── hf_finetune.py │ ├── hf_serve.py │ ├── hf_train.py │ ├── hf_utils.py │ └── peft_alignment_test.py ├── peft_test.sh └── python_interface_test.sh └── triton ├── CMakeLists.txt ├── Dockerfile.QA ├── README.md ├── cmake └── TritonLegionBackendConfig.cmake.in ├── qa ├── L0_e2e │ ├── models │ │ ├── add │ │ │ ├── 1 │ │ │ │ ├── model.onnx │ │ │ │ └── model.strategy │ │ │ └── config.pbtxt │ │ ├── cast │ │ │ ├── 1 │ │ │ │ ├── model.onnx │ │ │ │ └── model.strategy │ │ │ └── config.pbtxt │ │ ├── identity │ │ │ ├── 1 │ │ │ │ ├── model.onnx │ │ │ │ └── model.strategy │ │ │ └── config.pbtxt │ │ ├── mul │ │ │ ├── 1 │ │ │ │ ├── model.onnx │ │ │ │ └── model.strategy │ │ │ └── config.pbtxt │ │ ├── reciprocal │ │ │ ├── 1 │ │ │ │ ├── model.onnx │ │ │ │ └── model.strategy │ │ │ └── config.pbtxt │ │ ├── softmax │ │ │ ├── 1 │ │ │ │ ├── model.onnx │ │ │ │ └── model.strategy │ │ │ └── config.pbtxt │ │ ├── softmax1 │ │ │ ├── 1 │ │ │ │ ├── model.onnx │ │ │ │ └── model.strategy │ │ │ └── config.pbtxt │ │ ├── sqrt │ │ │ ├── 1 │ │ │ │ ├── model.onnx │ │ │ │ └── model.strategy │ │ │ └── config.pbtxt │ │ ├── sub │ │ │ ├── 1 │ │ │ │ ├── model.onnx │ │ │ │ └── model.strategy │ │ │ └── config.pbtxt │ │ └── tanh │ │ │ ├── 1 │ │ │ ├── model.onnx │ │ │ └── model.strategy │ │ │ └── config.pbtxt │ ├── operator_test.py │ ├── test.sh │ └── test_helpers.py ├── L0_parser │ └── test.sh └── common │ └── util.sh └── src ├── CMakeLists.txt ├── Makefile ├── accessor.h ├── backend.cc ├── common.h ├── config.h ├── cudahelp.h ├── instance.cc ├── instance.h ├── libtriton_legion.ldscript ├── model.cc ├── model.h ├── onnx ├── onnx-data.proto ├── onnx-ml.proto └── onnx-operators-ml.proto ├── onnx_parser.cc ├── onnx_parser.h ├── operator.cc ├── operator.h ├── operators ├── binary.cc ├── binary.cu ├── binary.h ├── concat.cc ├── concat.h ├── conv2d.cc ├── conv2d.h ├── flat.h ├── linear.h ├── matmul.cc ├── matmul.h ├── pool2d.cc ├── pool2d.h ├── reshape.cc ├── reshape.h ├── softmax.cc ├── softmax.h ├── unary.cc ├── unary.cu └── unary.h ├── runtime.cc ├── runtime.h ├── strategy.cc ├── strategy.h ├── tensor.cc ├── tensor.h ├── test ├── CMakeLists.txt ├── data │ ├── add.onnx │ ├── avg_pool.onnx │ ├── avg_pool_autopad.onnx │ ├── avg_pool_ceil.onnx │ ├── avg_pool_count_include_pad.onnx │ ├── avg_pool_pad.onnx │ ├── cast.onnx │ ├── conv2d_with_bias.onnx │ ├── identity.onnx │ ├── max_pool.onnx │ ├── max_pool_autopad.onnx │ ├── max_pool_ceil.onnx │ ├── max_pool_dilations.onnx │ ├── max_pool_order.onnx │ ├── mul.onnx │ ├── reciprocal.onnx │ ├── softmax.onnx │ ├── softmax_default_axis.onnx │ ├── softmax_negative_axis.onnx │ ├── sqrt.onnx │ ├── sub.onnx │ └── tanh.onnx ├── mock │ ├── binary.cc │ ├── concat.cc │ ├── conv2d.cc │ ├── legion.cc │ ├── matmul.cc │ ├── pool2d.cc │ ├── reshape.cc │ ├── softmax.cc │ ├── strategy.cc │ ├── triton_error.cc │ └── unary.cc ├── onnx_parser_test.cc └── scripts │ └── onnx_maker.py └── types.h /.dockerignore: -------------------------------------------------------------------------------- 1 | # Ignore all folders which start with "build" 2 | /build*/ 3 | 4 | # Ignore compiled files 5 | /.tools/ 6 | /python/flexflow_python 7 | /python/flexflow/core/legion_cffi.py 8 | python/flexflow/core/flexflow_cffi_header.py 9 | python/flexflow/core/legion_cffi_header.py 10 | *.pb.cc 11 | *.pb.h 12 | *.o 13 | *.a 14 | 15 | # Ignore 
inference assets 16 | /inference/weights/* 17 | /inference/tokenizer/* 18 | /inference/prompt/* 19 | /inference/output/* 20 | 21 | /tests/inference/python_test_configs/*.json 22 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **Description of changes:** 2 | 3 | 4 | 5 | **Related Issues:** 6 | 7 | Linked Issues: 8 | - Issue # 9 | 10 | Issues closed by this PR: 11 | - Closes # 12 | 13 | -------------------------------------------------------------------------------- /.github/runs-on.yml: -------------------------------------------------------------------------------- 1 | images: 2 | dlami-x64: 3 | platform: "linux" 4 | arch: "x64" 5 | ami: "ami-04a2add47e78915e6" 6 | 7 | runners: 8 | gpu-nvidia: 9 | family: ["g5.12xlarge"] 10 | image: dlami-x64 11 | rocm-builder: 12 | family: ["c4.8xlarge"] 13 | image: dlami-x64 -------------------------------------------------------------------------------- /.github/workflows/clang-format-check.yml: -------------------------------------------------------------------------------- 1 | name: Clang format 2 | on: [push, pull_request, workflow_dispatch] 3 | jobs: 4 | formatting-check: 5 | name: Formatting Check 6 | runs-on: ubuntu-22.04 7 | strategy: 8 | matrix: 9 | path: 10 | - check: "src" 11 | exclude: '\.proto$' 12 | - check: "include" 13 | - check: "inference" 14 | - check: "python" 15 | - check: "scripts" 16 | - check: "tests" 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Run clang-format style check for C/C++/Protobuf programs. 20 | uses: jidicula/clang-format-action@v4.8.0 21 | with: 22 | clang-format-version: "15" 23 | check-path: ${{ matrix.path['check'] }} 24 | exclude-regex: ${{ matrix.path['exclude'] }} 25 | -------------------------------------------------------------------------------- /.github/workflows/helpers/free_space_on_runner.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | set -x 4 | 5 | sudo rm -rf /usr/share/dotnet 6 | sudo rm -rf /usr/local/lib/android 7 | sudo rm -rf /opt/ghc 8 | sudo rm -rf "/usr/local/share/boost" 9 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 10 | -------------------------------------------------------------------------------- /.github/workflows/helpers/install_cudnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | set -x 4 | 5 | # Cd into directory holding this script 6 | cd "${BASH_SOURCE[0]%/*}" 7 | 8 | ubuntu_version=$(lsb_release -rs) 9 | ubuntu_version=${ubuntu_version//./} 10 | 11 | wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" 12 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 13 | sudo apt update -y 14 | rm -f cuda-keyring_1.1-1_all.deb 15 | sudo apt-get -y install libcudnn9-cuda-12 16 | sudo apt-get -y install libcudnn9-dev-cuda-12 17 | sudo apt-get -y install libcudnn9-samples 18 | sudo ldconfig 19 | -------------------------------------------------------------------------------- /.github/workflows/helpers/install_nccl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | set -x 4 | 5 | # Cd into directory holding this script 6 | cd "${BASH_SOURCE[0]%/*}" 7 | 8 | ubuntu_version=$(lsb_release -rs) 9 | ubuntu_version=${ubuntu_version//./} 10 | wget -c 
-q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" 11 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 12 | sudo apt-get update -y --allow-change-held-packages 13 | rm -f cuda-keyring_1.1-1_all.deb 14 | sudo apt install -y --allow-change-held-packages libnccl2 libnccl-dev 15 | -------------------------------------------------------------------------------- /.github/workflows/helpers/oracle_con.py: -------------------------------------------------------------------------------- 1 | import oci 2 | import argparse 3 | import os 4 | 5 | parser = argparse.ArgumentParser(description="Program with optional flags") 6 | group = parser.add_mutually_exclusive_group() 7 | group.add_argument("--start", action="store_true", help="Start action") 8 | group.add_argument("--stop", action="store_true", help="Stop action") 9 | parser.add_argument("--instance_id", type=str, required=True, help="instance id required") 10 | args = parser.parse_args() 11 | 12 | oci_key_content = os.getenv("OCI_CLI_KEY_CONTENT") 13 | 14 | config = { 15 | "user": os.getenv("OCI_CLI_USER"), 16 | "key_content": os.getenv("OCI_CLI_KEY_CONTENT"), 17 | "fingerprint": os.getenv("OCI_CLI_FINGERPRINT"), 18 | "tenancy": os.getenv("OCI_CLI_TENANCY"), 19 | "region": os.getenv("OCI_CLI_REGION") 20 | } 21 | 22 | # Initialize the OCI configuration 23 | oci.config.validate_config(config) 24 | 25 | # Initialize the ComputeClient to interact with VM instances 26 | compute = oci.core.ComputeClient(config) 27 | 28 | # Replace 'your_instance_id' with the actual instance ID of your VM 29 | instance_id = args.instance_id 30 | 31 | # Perform the action 32 | if args.start: 33 | # Start the VM 34 | compute.instance_action(instance_id, "START") 35 | else: 36 | # Stop the VM 37 | compute.instance_action(instance_id, "STOP") 38 | -------------------------------------------------------------------------------- /.github/workflows/shell-check.yml: -------------------------------------------------------------------------------- 1 | name: Shell Check 2 | on: [push, pull_request, workflow_dispatch] 3 | jobs: 4 | shellcheck: 5 | name: Shellcheck 6 | runs-on: ubuntu-22.04 7 | steps: 8 | - uses: actions/checkout@v3 9 | - name: Run ShellCheck 10 | uses: ludeeus/action-shellcheck@master 11 | with: 12 | ignore_paths: ./triton/** 13 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "deps/legion"] 2 | path = deps/legion 3 | url = https://github.com/flexflow/legion.git 4 | [submodule "deps/nccl"] 5 | path = deps/nccl 6 | url = https://github.com/NVIDIA/nccl.git 7 | [submodule "deps/variant"] 8 | path = deps/variant 9 | url = https://github.com/mpark/variant 10 | [submodule "deps/optional"] 11 | path = deps/optional 12 | url = https://github.com/TartanLlama/optional.git 13 | [submodule "deps/json"] 14 | path = deps/json 15 | url = https://github.com/nlohmann/json.git 16 | [submodule "deps/tokenizers-cpp"] 17 | path = deps/tokenizers-cpp 18 | url = https://github.com/mlc-ai/tokenizers-cpp.git 19 | fetchRecurseSubmodules = true 20 | [submodule "deps/flashinfer"] 21 | path = deps/flashinfer 22 | url = https://github.com/flashinfer-ai/flashinfer.git 23 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs 
configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.8" 13 | # You can also specify other tool versions: 14 | # nodejs: "16" 15 | # rust: "1.55" 16 | # golang: "1.17" 17 | 18 | # Build documentation in the docs/ directory with Sphinx 19 | sphinx: 20 | configuration: docs/source/conf.py 21 | 22 | # If using Sphinx, optionally build your docs in additional formats such as PDF 23 | formats: 24 | - pdf 25 | 26 | # Optionally declare the Python requirements required to build your docs 27 | python: 28 | install: 29 | - requirements: docs/requirements.txt 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft deps 2 | recursive-exclude . .git 3 | prune triton 4 | include python/flexflow/version.txt 5 | -------------------------------------------------------------------------------- /cmake/cudnn.cmake: -------------------------------------------------------------------------------- 1 | # find cudnn in CUDNN_ROOT and CUDA_ROOT 2 | if(CUDNN_PATH) 3 | set(CUDNN_ROOT ${CUDNN_PATH}) 4 | else() 5 | # if CUDNN_PATH is not set, let's try to find it in the CUDA root 6 | set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) 7 | endif() 8 | find_library(CUDNN_LIBRARY 9 | NAMES libcudnn${LIBEXT} 10 | PATHS ${CUDNN_ROOT} ${CUDA_ROOT} 11 | PATH_SUFFIXES lib lib64 12 | DOC "CUDNN library." ) 13 | 14 | find_path(CUDNN_INCLUDE_DIR 15 | NAMES cudnn.h 16 | HINTS ${CUDNN_ROOT} ${CUDA_ROOT} 17 | PATH_SUFFIXES include 18 | DOC "CUDNN include directory." ) 19 | 20 | # find cudnn, set cudnn lib and include 21 | if(CUDNN_LIBRARY AND CUDNN_INCLUDE_DIR) 22 | set(CUDNN_FOUND ON) 23 | set(CUDNN_LIBRARIES ${CUDNN_LIBRARY}) 24 | set(CUDNN_INCLUDE_DIRS ${CUDNN_INCLUDE_DIR}) 25 | endif() 26 | 27 | # find cuda and cudnn 28 | if(CUDNN_FOUND) 29 | list(APPEND FLEXFLOW_EXT_LIBRARIES 30 | ${CUDNN_LIBRARIES}) 31 | 32 | list(APPEND FLEXFLOW_INCLUDE_DIRS 33 | ${CUDNN_INCLUDE_DIR}) 34 | endif() 35 | 36 | if(CUDNN_FOUND) 37 | message( STATUS "CUDNN include : ${CUDNN_INCLUDE_DIR}" ) 38 | message( STATUS "CUDNN libraries : ${CUDNN_LIBRARIES}" ) 39 | else() 40 | message( FATAL_ERROR "CUDNN package not found -> specify search path via CUDNN_PATH variable") 41 | endif() 42 | -------------------------------------------------------------------------------- /cmake/hip.cmake: -------------------------------------------------------------------------------- 1 | if (NOT FF_HIP_ARCH STREQUAL "") 2 | if (FF_HIP_ARCH STREQUAL "all") 3 | set(FF_HIP_ARCH "gfx900,gfx902,gfx904,gfx906,gfx908,gfx909,gfx90a,gfx90c,gfx940,gfx1010,gfx1011,gfx1012,gfx1013,gfx1030,gfx1031,gfx1032,gfx1033,gfx1034,gfx1035,gfx1036,gfx1100,gfx1101,gfx1102,gfx1103") 4 | endif() 5 | string(REPLACE "," ";" HIP_ARCH_LIST "${FF_HIP_ARCH}") # convert comma-separated archs into a CMake list 6 | endif() 7 | 8 | message(STATUS "FF_HIP_ARCH: ${FF_HIP_ARCH}") 9 | if(FF_GPU_BACKEND STREQUAL "hip_rocm") 10 | #set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE STRING "Path to the clang compiler by ROCM" FORCE) 11 | set(GPU_TARGETS "${HIP_ARCH_LIST}" CACHE STRING "The GPU TARGETs") 12 | endif() 13 | -------------------------------------------------------------------------------- /cmake/json.cmake: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | 3 |
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz) 4 | FetchContent_MakeAvailable(json) -------------------------------------------------------------------------------- /cmake/optional.cmake: -------------------------------------------------------------------------------- 1 | set(OPTIONAL_BUILD_TESTS OFF) 2 | set(OPTIONAL_BUILD_PACKAGE OFF) 3 | 4 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/optional) 5 | 6 | list(APPEND FLEXFLOW_EXT_LIBRARIES optional) 7 | list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/optional/include/) 8 | -------------------------------------------------------------------------------- /cmake/variant.cmake: -------------------------------------------------------------------------------- 1 | add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/variant) 2 | 3 | list(APPEND FLEXFLOW_EXT_LIBRARIES mpark_variant) 4 | list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/variant/include/) -------------------------------------------------------------------------------- /conda/flexflow.yml: -------------------------------------------------------------------------------- 1 | name: flexflow 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python 7 | - cffi 8 | - rust 9 | - cmake-build-extension 10 | - jq 11 | - pytest 12 | - pip 13 | - pip: 14 | - numpy 15 | - torch 16 | - torchaudio 17 | - torchvision 18 | - regex 19 | - transformers>=4.47.1 20 | - sentencepiece 21 | - einops 22 | - requests 23 | - scipy 24 | - bitsandbytes 25 | - datasets 26 | - accelerate 27 | - loralib 28 | - triton 29 | - peft 30 | - pytest 31 | -------------------------------------------------------------------------------- /docker/flexflow/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG FF_GPU_BACKEND=cuda 2 | ARG gpu_backend_version=12.1 3 | FROM flexflow-environment-$FF_GPU_BACKEND$gpu_backend_version:latest 4 | 5 | LABEL org.opencontainers.image.source=https://github.com/flexflow/flexflow-serve 6 | LABEL org.opencontainers.image.description="flexflow-serve container" 7 | 8 | # Copy flexflow-serve repository 9 | RUN mkdir flexflow-serve 10 | WORKDIR /usr/flexflow-serve 11 | COPY . . 12 | 13 | # Args to build flexflow-serve 14 | ARG BUILD_CONFIGS 15 | ARG N_BUILD_CORES 16 | 17 | # Create install directory if needed 18 | RUN for pair in $BUILD_CONFIGS; do \ 19 | key=${pair%%=*}; \ 20 | value=${pair#*=}; \ 21 | if [ "$key" = "INSTALL_DIR" ] && [ -n "$value" ]; then \ 22 | mkdir -p "$value"; \ 23 | fi; \ 24 | done 25 | 26 | # Build and install C++ and Python versions of flexflow-serve 27 | RUN mkdir -p build && cd build && \ 28 | eval "$BUILD_CONFIGS" ../config/config.linux && \ 29 | make -j $N_BUILD_CORES install && \ 30 | ldconfig 31 | 32 | ENTRYPOINT ["/bin/bash"] 33 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile clean 16 | 17 | clean: 18 | rm -rf build doxygen/output doxygen/cpp_api 19 | @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | 21 | # Catch-all target: route all unknown targets to Sphinx using the new 22 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 23 | %: Makefile 24 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 25 | -------------------------------------------------------------------------------- /docs/doxygen/README.md: -------------------------------------------------------------------------------- 1 | # Doxygen Documentation 2 | 3 | This directory holds the configuration file for building 4 | the HTML Doxygen documentation for the C++ and Python code. 5 | For now, this documentation is mainly intended for FlexFlow developers. 6 | 7 | ## Generate documentation locally 8 | 9 | 1. Install [doxygen](https://www.doxygen.nl/index.html). The configuration file is based on Doxygen 1.9.3, but all recent Doxygen versions should work. 10 | 2. Define the `$FF_HOME` environment variable to be the root directory of the FlexFlow repo. 11 | 3. Run Doxygen with `doxygen $FF_HOME/docs/doxygen/Doxyfile`. 12 | 4. Now, you may browse the docs by opening the index page `$FF_HOME/docs/doxygen/output/html/index.html` with your favorite web browser. 13 | -------------------------------------------------------------------------------- /docs/doxygen/theme/rust_footer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 16 | 17 | 18 | 19 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | m2r2 4 | breathe 5 | exhale 6 | cffi 7 | numpy 8 | qualname 9 | -------------------------------------------------------------------------------- /docs/source/chatbot.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | ******** 3 | Chatbot 4 | ******** 5 | 6 | The chatbot use case involves setting up a conversational AI model using FlexFlow Serve, capable of engaging in interactive dialogues with users.
7 | 8 | Requirements 9 | ============ 10 | 11 | - FlexFlow Serve setup with required configurations. 12 | - Gradio or any interactive interface tool. 13 | 14 | Implementation 15 | ============== 16 | 17 | 1. FlexFlow Initialization 18 | Initialize FlexFlow Serve with the desired configurations and a specific LLM model. 19 | 20 | 2. Gradio Interface Setup 21 | Define a function for response generation based on user inputs, then set up the Gradio Chat Interface for interaction. Note that ``gr.ChatInterface`` calls this function with both the new message and the chat history. 22 | 23 | .. code-block:: python 24 | 25 | def generate_response(message, history): 26 | result = llm.generate(message) 27 | return result.output_text.decode('utf-8') 28 | 29 | 30 | 3. Running the Interface 31 | Launch the Gradio interface and interact with the model by entering text inputs. 32 | 33 | .. image:: /imgs/gradio_interface.png 34 | :alt: Gradio Chatbot Interface 35 | :align: center 36 | 37 | 4. Shutdown 38 | Stop the FlexFlow server after interaction. 39 | 40 | Example 41 | ======= 42 | 43 | Complete code examples can be found here: 44 | 45 | 1. `Chatbot Example with incremental decoding `__ 46 | 47 | 2. `Chatbot Example with speculative inference `__ 48 | 49 | 50 | Example Implementation: 51 | 52 | .. code-block:: python 53 | 54 | import gradio as gr 55 | import flexflow.serve as ff 56 | 57 | ff.init(num_gpus=2, memory_per_gpu=14000, ...)  # then create and compile the LLM, e.g. llm = ff.LLM(...) 58 | 59 | def generate_response(message, history): 60 | result = llm.generate(message) 61 | return result.output_text.decode('utf-8') 62 | 63 | iface = gr.ChatInterface(fn=generate_response) 64 | iface.launch() -------------------------------------------------------------------------------- /docs/source/cpp_api.rst: -------------------------------------------------------------------------------- 1 | ************* 2 | C++ API 3 | ************* 4 | 5 | The FlexFlow backend is at the core of FlexFlow Train and FlexFlow Serve. It is written entirely in C/C++ and CUDA/HIP. This section documents the API, which is generated by Doxygen and available at the following links: 6 | 7 | * `CUDA version <./cuda_api/index.html>`_ (default version) 8 | * `HIP version <./hip_api/index.html>`_ 9 | 10 | The two versions only differ when it comes to the GPU kernels, so the great majority of the entries are identical. If you are unsure which version to use, take a look at the CUDA version. 11 | -------------------------------------------------------------------------------- /docs/source/developers_guide/developers_guide.rst: -------------------------------------------------------------------------------- 1 | ****************** 2 | Developers Guide 3 | ****************** 4 | 5 | .. mdinclude:: ../../../CONTRIBUTING.md 6 | :start-line: 2 7 | -------------------------------------------------------------------------------- /docs/source/developers_guide/ff_internals.rst: -------------------------------------------------------------------------------- 1 | ******************* 2 | FlexFlow Internals 3 | ******************* 4 | 5 | .. mdinclude:: internals.md 6 | :start-line: 2 7 | -------------------------------------------------------------------------------- /docs/source/developers_guide/internals.md: -------------------------------------------------------------------------------- 1 | # FlexFlow Internals 2 | 3 | ## The Parallel Computation Graph (PCG) 4 | 5 | FlexFlow uses a _Parallel Computation Graph (PCG)_ to simultaneously represent tensor operations, as well as parallelism choices and data movement across nodes.
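To make the idea concrete, here is a deliberately simplified sketch of the information a PCG node carries. The names below (`ParallelDim`, `PcgTensor`, `PcgNode`) are illustrative only and are not FlexFlow's actual classes; the real definitions are `PCG::Node` (`include/flexflow/node.h`), `ParallelTensor` (`include/flexflow/parallel_tensor.h`), and `MachineView` (`include/flexflow/machine_view.h`):

```cpp
#include <string>
#include <vector>

// Illustrative sketch only: a PCG node couples a tensor operation with
// the parallelization choices made for it.
struct ParallelDim {
  int size;    // logical extent of this tensor dimension
  int degree;  // number of partitions along this dimension
};

struct PcgTensor {
  std::vector<ParallelDim> dims;  // per-dimension partitioning
  int num_replicas;               // how many copies of the tensor exist
};

struct PcgNode {
  std::string op_type;             // e.g. "Linear", "Combine", "Replicate"
  std::vector<PcgTensor> inputs;   // parallelized input tensors
  std::vector<PcgTensor> outputs;  // parallelized output tensors
  std::vector<int> device_ids;     // which device holds each partition
};
```

Because data-movement operators such as `Combine`, `Replicate`, and `Partition` (see `include/flexflow/parallel_ops/`) are ordinary nodes in the same graph, a single optimization pass over the PCG can reason about operator choice, parallel layout, and communication cost together.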
6 | 7 | ### Tensor representations 8 | 9 | There are two types of tensor representations in FlexFlow: a [Tensor](./cuda_api/de/da9/structFlexFlow_1_1TensorBase.html) and a [ParallelTensor](./cuda_api/d3/dfc/structFlexFlow_1_1ParallelTensorBase.html). The first variant is used when writing a FlexFlow DNN program, whereas the second is used by the runtime to run all the computations in a distributed fashion. `Tensor` and `ParallelTensor` are implemented as typedef-ed pointers to, respectively, the `TensorBase` (defined in `include/flexflow/tensor.h`) and `ParallelTensorBase` (defined in `include/flexflow/parallel_tensor.h`) structs. 10 | 11 | The `ParallelTensor` struct contains all the information that a `Tensor` also stores, but in addition, it also codifies how the tensor should be parallelized. For instance, a ParallelTensor records how each dimension is *partitioned*, how many *replicas* of the tensors have been created, and the *mapping* between the partitions of the tensors and the physical machines that will store them. 12 | 13 | ## Transformation generation 14 | 15 | ## Joint optimization 16 | -------------------------------------------------------------------------------- /docs/source/docker.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | ************* 3 | Docker 4 | ************* 5 | We provide a ready-to-use Docker container to quickly run FlexFlow with no manual installation required. To use it, follow the steps below. 6 | 7 | .. mdinclude:: ../../docker/README.md 8 | :start-line: 3 9 | -------------------------------------------------------------------------------- /docs/source/imgs/gradio_api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/docs/source/imgs/gradio_api.png -------------------------------------------------------------------------------- /docs/source/imgs/gradio_interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/docs/source/imgs/gradio_interface.png -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. FlexFlow documentation master file, created by 2 | sphinx-quickstart on Tue Dec 15 14:16:53 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to FlexFlow's documentation! 7 | ==================================== 8 | 9 | .. toctree:: 10 | :caption: Getting Started 11 | 12 | welcome 13 | installation 14 | docker 15 | multinode 16 | 17 | .. toctree:: 18 | :caption: FlexFlow Serve 19 | 20 | serve_overview 21 | serve_usecases 22 | serve_api 23 | 24 | .. toctree:: 25 | :caption: FlexFlow Train 26 | 27 | train_overview 28 | train_interface 29 | train_examples 30 | 31 | train_python_api 32 | 33 | .. toctree:: 34 | :caption: FlexFlow Backend 35 | 36 | cpp_api 37 | 38 | .. toctree:: 39 | :maxdepth: 3 40 | :caption: Developers Guide 41 | 42 | developers_guide/developers_guide.rst 43 | .. developers_guide/ff_internals.rst 44 | 45 | 46 | .. Indices and tables 47 | .. ================== 48 | .. 49 | .. * :ref:`genindex` 50 | .. * :ref:`modindex` 51 | .. 
* :ref:`search` 52 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | ************* 3 | Building from source 4 | ************* 5 | 6 | .. mdinclude:: ../../INSTALL.md 7 | :start-line: 2 8 | -------------------------------------------------------------------------------- /docs/source/keras.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | **************** 3 | Keras Interface 4 | **************** 5 | 6 | FlexFlow provides a drop-in replacement for TensorFlow Keras. Running an existing Keras program on the FlexFlow backend only requires a few lines of changes to the program. The detailed instructions are as follows: 7 | 8 | 1. Replace the Keras header files 9 | ================================= 10 | 11 | Redirect the program to import Keras functions from FlexFlow by using the following import header lines:: 12 | 13 | from flexflow.keras.models import Model, Sequential 14 | from flexflow.keras.layers import Input, Dense, Conv2D, ... 15 | from flexflow.keras.callbacks import Callback, ... 16 | 17 | 2. Modify the main Keras program 18 | ================================ 19 | 20 | FlexFlow requires a Keras program to wrap its model construction in a Python function called ``top_level_task()``. This allows FlexFlow to automatically parallelize DNN training across all GPUs on all compute nodes. For example, the following code snippet shows parallelizing AlexNet training in FlexFlow:: 21 | 22 | def top_level_task(): 23 | model = Sequential() 24 | model.add(Conv2D(filters=64, input_shape=(3,229,229), kernel_size=(11,11), strides=(4,4), padding=(2,2), activation="relu")) 25 | model.add(MaxPooling2D(pool_size=(3,3), strides=(2,2), padding="valid")) 26 | model.add(Conv2D(filters=192, kernel_size=(5,5), strides=(1,1), padding=(2,2), activation="relu")) 27 | ## More lines for model construction 28 | model.add(Activation("softmax")) 29 | ## Model compilation 30 | model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy']) 31 | ## Model training 32 | (x_train, y_train) = cifar10.load_data() 33 | model.fit(x_train, y_train, epochs=30) 34 | 35 | if __name__ == "__main__": 36 | top_level_task() 37 | 38 | More FlexFlow Keras examples are available on `GitHub `_. 39 | -------------------------------------------------------------------------------- /docs/source/mt5.rst: -------------------------------------------------------------------------------- 1 | ************************ 2 | mT5 Model 3 | ************************ 4 | 5 | .. mdinclude:: ../../examples/python/pytorch/mt5/README.md 6 | :start-line: 2 7 | -------------------------------------------------------------------------------- /docs/source/multinode.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | ****************** 3 | Multinode tutorial 4 | ****************** 5 | 6 | 7 | .. mdinclude:: ../../MULTI-NODE.md 8 | :start-line: 3 9 | -------------------------------------------------------------------------------- /docs/source/onnx.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | ************* 3 | ONNX Support 4 | ************* 5 | 6 | Similar to the PyTorch front-end, FlexFlow also supports training existing ONNX models. 
Since both ONNX and FlexFlow use Protocol Buffers, make sure they are linked against the same version of Protocol Buffers. 7 | 8 | 1. Export an ONNX model to an external file 9 | =============================================== 10 | 11 | A PyTorch model can be exported to the ONNX format and saved into an external file:: 12 | 13 | import onnx 14 | import torch 15 | import torch.nn as nn 16 | from torch.onnx import TrainingMode 17 | 18 | # create a PyTorch model 19 | class MyPyTorchModule(nn.Module): 20 | ... 21 | 22 | # export the PyTorch model to an ONNX model 23 | model = MyPyTorchModule() 24 | torch.onnx.export(model, (input), "filename", export_params=False, training=TrainingMode.TRAINING) 25 | 26 | 2. Import a FlexFlow model from an external file 27 | =============================================== 28 | 29 | A FlexFlow program can directly import a previously saved ONNX model and autotune the parallelization performance for a given parallel machine:: 30 | 31 | from flexflow.onnx.model import ONNXModel 32 | 33 | # create input tensors 34 | dims_input = [ffconfig.get_batch_size(), 3, 32, 32] 35 | input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT) 36 | 37 | # create a flexflow model from the file 38 | onnx_model = ONNXModel("cifar10_cnn.onnx") 39 | output_tensor = onnx_model.apply(ffmodel, {"input.1": input_tensor}) 40 | 41 | # use the Python API to train the model 42 | ffoptimizer = SGDOptimizer(ffmodel, 0.01) 43 | ffmodel.set_sgd_optimizer(ffoptimizer) 44 | ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) 45 | ... 46 | ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs) 47 | 48 | More FlexFlow ONNX examples are available on `GitHub `_. 49 | -------------------------------------------------------------------------------- /docs/source/prompt_template.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | **************** 3 | Prompt Template 4 | **************** 5 | 6 | Prompt templates guide the model's response generation. This use case demonstrates setting up FlexFlow Serve to integrate with Langchain and using templates to handle dynamic prompts. 7 | 8 | Requirements 9 | ============ 10 | 11 | - FlexFlow Serve setup with appropriate configurations. 12 | - Langchain integration with templates for prompt management. 13 | 14 | Implementation 15 | ============== 16 | 17 | 1. FlexFlow Initialization 18 | Initialize and configure FlexFlow Serve. 19 | 20 | 2. LLM Setup 21 | Compile and start the server for text generation. 22 | 23 | 3. Prompt Template Setup 24 | Set up a prompt template to guide the model's responses. 25 | 26 | 4. Response Generation 27 | Use the LLM with the prompt template to generate a response. 28 | 29 | 5. Shutdown 30 | Stop the FlexFlow server after generating the response. 31 | 32 | Example 33 | ======= 34 | 35 | Complete code examples can be found here: 36 | 37 | 1. `Prompt Template Example with incremental decoding `__ 38 | 39 | 2. `Prompt Template Example with speculative inference `__ 40 | 41 | 42 | Example Implementation: 43 | 44 | .. code-block:: python 45 | 46 | import flexflow.serve as ff 47 | from langchain.prompts import PromptTemplate 48 | 49 | ff_llm = FlexFlowLLM(...) 50 | ff_llm.compile_and_start(...)
51 | 52 | template = "Question: {question}\nAnswer:" 53 | prompt = PromptTemplate(template=template, input_variables=["question"]) 54 | 55 | response = ff_llm.generate(prompt.format(question="Who was the US president in 1997?")) 56 | -------------------------------------------------------------------------------- /docs/source/python/create.rst: -------------------------------------------------------------------------------- 1 | ************** 2 | Model Creation 3 | ************** 4 | .. automodule:: flexflow.core.flexflow_cffi 5 | :noindex: 6 | 7 | Model Creation 8 | ============== 9 | .. autoclass:: FFModel() 10 | :noindex: 11 | :members: __init__ 12 | 13 | Tensor Creation 14 | =============== 15 | .. autoclass:: FFModel() 16 | :noindex: 17 | :members: create_tensor 18 | -------------------------------------------------------------------------------- /docs/source/python/dataloader.rst: -------------------------------------------------------------------------------- 1 | ************** 2 | Dataloader API 3 | ************** 4 | .. automodule:: flexflow.core.flexflow_cffi 5 | :noindex: 6 | 7 | Dataloader Creation 8 | =================== 9 | .. autoclass:: FFModel() 10 | :noindex: 11 | :members: create_data_loader 12 | 13 | Use Dataloader for Training 14 | =========================== 15 | .. autoclass:: SingleDataLoader() 16 | :noindex: 17 | :members: reset, next_batch -------------------------------------------------------------------------------- /docs/source/python/init.rst: -------------------------------------------------------------------------------- 1 | ******************** 2 | Model Initialization 3 | ******************** 4 | .. automodule:: flexflow.core.flexflow_cffi 5 | :noindex: 6 | 7 | Compile 8 | ======= 9 | .. autoclass:: FFModel() 10 | :noindex: 11 | :members: compile 12 | 13 | Initialization 14 | ============== 15 | .. autoclass:: FFModel() 16 | :noindex: 17 | :members: init_layers 18 | -------------------------------------------------------------------------------- /docs/source/python/models.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | Models API 3 | ************ 4 | 5 | The Models API in FlexFlow is used to create models. 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | create 11 | init 12 | train -------------------------------------------------------------------------------- /docs/source/python/train.rst: -------------------------------------------------------------------------------- 1 | ************************** 2 | Model Training and Testing 3 | ************************** 4 | .. automodule:: flexflow.core.flexflow_cffi 5 | :noindex: 6 | 7 | Fit 8 | ======= 9 | .. autoclass:: FFModel() 10 | :noindex: 11 | :members: fit 12 | 13 | Evaluate 14 | ============== 15 | .. autoclass:: FFModel() 16 | :noindex: 17 | :members: eval 18 | 19 | Customized Training 20 | =================== 21 | .. autoclass:: FFModel() 22 | :noindex: 23 | :members: forward, backward, zero_gradients, update, reset_metrics, compute_metrics 24 | 25 | -------------------------------------------------------------------------------- /docs/source/serve_api.rst: -------------------------------------------------------------------------------- 1 | ************************** 2 | FlexFlow Serve Python API 3 | ************************** 4 | 5 | ..
toctree:: 6 | serve_fastapi 7 | serve_gradioapi -------------------------------------------------------------------------------- /docs/source/serve_gradioapi.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | ************************* 3 | FlexFlow Serve Gradio API 4 | ************************* 5 | 6 | Introduction 7 | ============ 8 | 9 | Users can also set up the API endpoints with a Gradio Chatbot Interface. 10 | 11 | Requirements 12 | ------------ 13 | 14 | - FlexFlow Serve setup with necessary configurations. 15 | - Running the gradio chatbot interface. 16 | 17 | Example 18 | ======== 19 | 20 | In a running gradio chatbot interface, hit the "Use via API" button on the bottom left. 21 | 22 | .. image:: /imgs/gradio_interface.png 23 | :alt: Gradio Chatbot Interface 24 | :align: center 25 | 26 | Users can easily access an API endpoint for sending prompts to the model. 27 | 28 | .. image:: /imgs/gradio_api.png 29 | :alt: Gradio API 30 | :align: center -------------------------------------------------------------------------------- /docs/source/serve_overview.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | ************* 3 | Serving Overview 4 | ************* 5 | 6 | .. mdinclude:: ../../SERVE.md 7 | :start-line: 3 8 | -------------------------------------------------------------------------------- /docs/source/serve_usecases.rst: -------------------------------------------------------------------------------- 1 | ******************* 2 | Serving Usecases 3 | ******************* 4 | 5 | .. toctree:: 6 | chatbot 7 | prompt_template 8 | rag -------------------------------------------------------------------------------- /docs/source/train_examples.rst: -------------------------------------------------------------------------------- 1 | ************* 2 | Training Examples 3 | ************* 4 | 5 | .. toctree:: 6 | mt5 -------------------------------------------------------------------------------- /docs/source/train_interface.rst: -------------------------------------------------------------------------------- 1 | ******************* 2 | Training Interface 3 | ******************* 4 | 5 | .. toctree:: 6 | keras 7 | pytorch 8 | onnx -------------------------------------------------------------------------------- /docs/source/train_overview.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | ************* 3 | Training Overview 4 | ************* 5 | 6 | .. mdinclude:: ../../TRAIN.md 7 | :start-line: 3 8 | -------------------------------------------------------------------------------- /docs/source/train_python_api.rst: -------------------------------------------------------------------------------- 1 | ******************* 2 | Python API 3 | ******************* 4 | This section documents the Python API for FlexFlow Train. 5 | 6 | .. toctree:: 7 | :maxdepth: 3 8 | 9 | python/models 10 | python/layers 11 | python/dataloader -------------------------------------------------------------------------------- /docs/source/welcome.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 1 2 | ************* 3 | Overview 4 | ************* 5 | 6 | .. 
mdinclude:: ../../README.md 7 | :start-line: 3 8 | -------------------------------------------------------------------------------- /img/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/img/overview.png -------------------------------------------------------------------------------- /img/performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/img/performance.png -------------------------------------------------------------------------------- /img/spec_infer_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/img/spec_infer_demo.gif -------------------------------------------------------------------------------- /include/flexflow/device.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_DEVICE_H_ 2 | #define _FLEXFLOW_DEVICE_H_ 3 | 4 | #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) 5 | #include <cuda_runtime.h> 6 | #include <cudnn.h> 7 | #elif defined(FF_USE_HIP_ROCM) 8 | #include <hip/hip_runtime.h> 9 | #include <miopen/miopen.h> 10 | #else 11 | #error "Unknown device" 12 | #endif 13 | 14 | namespace FlexFlow { 15 | #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) 16 | typedef cudaStream_t ffStream_t; 17 | cudaError_t get_legion_stream(cudaStream_t *stream); 18 | typedef cudnnTensorDescriptor_t ffTensorDescriptor_t; 19 | typedef cudnnActivationDescriptor_t ffActivationDescriptor_t; 20 | typedef cudnnPoolingDescriptor_t ffPoolingDescriptor_t; 21 | #elif defined(FF_USE_HIP_ROCM) 22 | typedef hipStream_t ffStream_t; 23 | hipError_t get_legion_stream(hipStream_t *stream); 24 | typedef miopenTensorDescriptor_t ffTensorDescriptor_t; 25 | typedef miopenActivationDescriptor_t ffActivationDescriptor_t; 26 | typedef miopenPoolingDescriptor_t ffPoolingDescriptor_t; 27 | #else 28 | #error "Unknown device" 29 | #endif 30 | }; // namespace FlexFlow 31 | 32 | #endif // _FLEXFLOW_DEVICE_H_ 33 | -------------------------------------------------------------------------------- /include/flexflow/ffconst_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_FFCONST_UTILS_H 2 | #define _FLEXFLOW_FFCONST_UTILS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include <string> 6 | 7 | namespace FlexFlow { 8 | 9 | std::string get_operator_type_name(OperatorType type); 10 | 11 | size_t data_type_size(DataType type); 12 | 13 | #define INT4_NUM_OF_ELEMENTS_PER_GROUP 32 14 | 15 | size_t get_quantization_to_byte_size(DataType type, 16 | DataType quantization_type, 17 | size_t num_elements); 18 | 19 | std::ostream &operator<<(std::ostream &, OperatorType); 20 | 21 | }; // namespace FlexFlow 22 | 23 | #endif // _FLEXFLOW_FFCONST_UTILS_H 24 | -------------------------------------------------------------------------------- /include/flexflow/fftype.h: -------------------------------------------------------------------------------- 1 | #ifndef _FF_TYPE_H 2 | #define _FF_TYPE_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include <cstddef> 6 | #include <functional> 7 | #include <iostream> 8 | 9 | namespace FlexFlow { 10 | 11 | class LayerID { 12 | public: 13 | static const LayerID NO_ID; 14 | LayerID(); 15 | LayerID(size_t id, size_t transformer_layer_id, size_t model_id); 16 | bool is_valid_id()
const; 17 | friend bool operator==(LayerID const &lhs, LayerID const &rhs); 18 | 19 | public: 20 | size_t id, transformer_layer_id, model_id; 21 | }; 22 | 23 | class PEFTModelID { 24 | public: 25 | static const PEFTModelID NO_ID; 26 | PEFTModelID(); 27 | PEFTModelID(size_t id); 28 | bool is_valid_id() const; 29 | friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); 30 | friend bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs); 31 | friend std::ostream &operator<<(std::ostream &os, 32 | PEFTModelID const &peft_model_id); 33 | 34 | public: 35 | size_t id; 36 | }; 37 | 38 | }; // namespace FlexFlow 39 | 40 | namespace std { 41 | template <> 42 | struct hash<FlexFlow::PEFTModelID> { 43 | size_t operator()(FlexFlow::PEFTModelID const &n) const { 44 | return n.id; 45 | } 46 | }; 47 | } // namespace std 48 | 49 | #endif // _FF_TYPE_H 50 | -------------------------------------------------------------------------------- /include/flexflow/node.h: -------------------------------------------------------------------------------- 1 | #ifndef _NODE_H 2 | #define _NODE_H 3 | 4 | #include <string> 5 | 6 | #include "tl/optional.hpp" 7 | 8 | namespace FlexFlow { 9 | 10 | class Op; 11 | 12 | namespace PCG { 13 | 14 | struct Node { 15 | Node(void); 16 | Node(size_t _guid, Op *_ptr) : guid(_guid), ptr(_ptr) {} 17 | inline bool operator==(Node const &b) const { 18 | if (guid != b.guid) { 19 | return false; 20 | } 21 | if (ptr != b.ptr) { 22 | return false; 23 | } 24 | if (original_guid != b.original_guid) { 25 | return false; 26 | } 27 | return true; 28 | } 29 | inline bool operator!=(Node const &b) const { 30 | if (guid != b.guid) { 31 | return true; 32 | } 33 | if (ptr != b.ptr) { 34 | return true; 35 | } 36 | if (original_guid != b.original_guid) { 37 | return true; 38 | } 39 | return false; 40 | } 41 | inline bool operator<(Node const &b) const { 42 | if (guid != b.guid) { 43 | return guid < b.guid; 44 | } 45 | if (ptr != b.ptr) { 46 | return ptr < b.ptr; 47 | } 48 | if (original_guid != b.original_guid) { 49 | return false; 50 | } 51 | return false; 52 | } 53 | Node &operator=(Node const &n) { 54 | guid = n.guid; 55 | ptr = n.ptr; 56 | original_guid = n.original_guid; 57 | return *this; 58 | } 59 | std::string op_to_string(Op const *ptr) const; 60 | std::string to_string(void) const { 61 | if (ptr != NULL) { 62 | return op_to_string(ptr) + "_" + std::to_string(guid); 63 | } else { 64 | return "UnmappedOp_" + std::to_string(guid); 65 | } 66 | } 67 | static const Node INVALID_NODE; 68 | size_t guid; 69 | Op const *ptr; 70 | 71 | tl::optional<size_t> original_guid = tl::nullopt; 72 | }; 73 | 74 | }; // namespace PCG 75 | 76 | }; // namespace FlexFlow 77 | 78 | #endif // _NODE_H 79 | -------------------------------------------------------------------------------- /include/flexflow/op_meta.h: -------------------------------------------------------------------------------- 1 | #ifndef _OP_META_H 2 | #define _OP_META_H 3 | 4 | #include "flexflow/config.h" 5 | 6 | namespace FlexFlow { 7 | 8 | class Op; 9 | 10 | class OpMeta { 11 | public: 12 | // OpMeta(FFHandler _handle); 13 | OpMeta(FFHandler _handle, Op const *op); 14 | 15 | public: 16 | FFHandler handle; 17 | bool profiling; // Measure the run time of the task 18 | bool inference_debugging; 19 | bool enable_peft_finetuning; 20 | int decoding_step; 21 | int bwd_step; 22 | char op_name[MAX_OPNAME]; 23 | LayerID layer_guid; 24 | bool trainable_inputs[MAX_NUM_INPUTS]; 25 | bool reset_input_grads[MAX_NUM_INPUTS]; 26 | DataType input_type[MAX_NUM_INPUTS]; 27 | DataType
weight_type[MAX_NUM_WEIGHTS]; 28 | DataType output_type[MAX_NUM_OUTPUTS]; 29 | }; 30 | 31 | }; // namespace FlexFlow 32 | 33 | #endif //_OP_META_H 34 | -------------------------------------------------------------------------------- /include/flexflow/ops/add_bias_residual_layer_norm_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "flexflow/ffconst.h" 4 | #include "flexflow/fftype.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct AddBiasResidualLayerNormParams { 10 | LayerID layer_guid; 11 | std::vector<int> axes; 12 | bool elementwise_affine; 13 | float eps; 14 | bool use_bias; 15 | bool inplace_residual; 16 | char name[MAX_OPNAME]; 17 | bool is_valid( 18 | std::pair<ParallelTensorShape, ParallelTensorShape> const &) const; 19 | }; 20 | 21 | bool operator==(AddBiasResidualLayerNormParams const &, 22 | AddBiasResidualLayerNormParams const &); 23 | 24 | } // namespace FlexFlow 25 | 26 | namespace std { 27 | template <> 28 | struct hash<FlexFlow::AddBiasResidualLayerNormParams> { 29 | size_t operator()(FlexFlow::AddBiasResidualLayerNormParams const &) const; 30 | }; 31 | } // namespace std 32 | -------------------------------------------------------------------------------- /include/flexflow/ops/aggregate_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_AGGREGATE_PARAMS_H 2 | #define _FLEXFLOW_AGGREGATE_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct AggregateParams { 10 | int n; 11 | float lambda_bal; 12 | char name[MAX_OPNAME]; 13 | bool is_valid(std::vector<ParallelTensorShape> const &) const; 14 | }; 15 | bool operator==(AggregateParams const &, AggregateParams const &); 16 | 17 | } // namespace FlexFlow 18 | 19 | namespace std { 20 | template <> 21 | struct hash<FlexFlow::AggregateParams> { 22 | size_t operator()(FlexFlow::AggregateParams const &) const; 23 | }; 24 | } // namespace std 25 | 26 | #endif // _FLEXFLOW_AGGREGATE_PARAMS_H 27 | -------------------------------------------------------------------------------- /include/flexflow/ops/aggregate_spec_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_AGGREGATE_SPEC_PARAMS_H 2 | #define _FLEXFLOW_AGGREGATE_SPEC_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct AggregateSpecParams { 10 | int n; 11 | float lambda_bal; 12 | char name[MAX_OPNAME]; 13 | bool is_valid(ParallelTensorShape const &) const; 14 | }; 15 | bool operator==(AggregateSpecParams const &, AggregateSpecParams const &); 16 | 17 | } // namespace FlexFlow 18 | 19 | namespace std { 20 | template <> 21 | struct hash<FlexFlow::AggregateSpecParams> { 22 | size_t operator()(FlexFlow::AggregateSpecParams const &) const; 23 | }; 24 | } // namespace std 25 | 26 | #endif // _FLEXFLOW_AGGREGATE_SPEC_PARAMS_H 27 | -------------------------------------------------------------------------------- /include/flexflow/ops/arg_topk_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_ARG_TOPK_PARAMS_H 2 | #define _FLEXFLOW_ARG_TOPK_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct ArgTopKParams { 11 | LayerID layer_guid; 12 | int k; 13 | bool sorted; 14 | bool speculative_decoding; 15 | char name[MAX_OPNAME]; 16 | bool is_valid(ParallelTensorShape const &) const; 17 | }; 18 | bool
operator==(ArgTopKParams const &, ArgTopKParams const &); 19 | 20 | } // namespace FlexFlow 21 | 22 | namespace std { 23 | template <> 24 | struct hash<FlexFlow::ArgTopKParams> { 25 | size_t operator()(FlexFlow::ArgTopKParams const &) const; 26 | }; 27 | } // namespace std 28 | 29 | #endif // _FLEXFLOW_ARG_TOPK_PARAMS_H 30 | -------------------------------------------------------------------------------- /include/flexflow/ops/argmax_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_ARGMAX_PARAMS_H 2 | #define _FLEXFLOW_ARGMAX_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct ArgMaxParams { 10 | bool beam_search; 11 | bool is_valid(ParallelTensorShape const &) const; 12 | char name[MAX_OPNAME]; 13 | }; 14 | bool operator==(ArgMaxParams const &, ArgMaxParams const &); 15 | 16 | } // namespace FlexFlow 17 | 18 | namespace std { 19 | template <> 20 | struct hash<FlexFlow::ArgMaxParams> { 21 | size_t operator()(FlexFlow::ArgMaxParams const &) const; 22 | }; 23 | } // namespace std 24 | 25 | #endif // _FLEXFLOW_ARGMAX_PARAMS_H -------------------------------------------------------------------------------- /include/flexflow/ops/attention_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_ATTENTION_PARAMS_H 2 | #define _FLEXFLOW_ATTENTION_PARAMS_H 3 | 4 | #include "flexflow/fftype.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct MultiHeadAttentionParams { 10 | LayerID layer_guid; 11 | int embed_dim, num_heads, kdim, vdim; 12 | float dropout; 13 | bool bias, add_bias_kv, add_zero_attn; 14 | char name[MAX_OPNAME]; 15 | 16 | bool is_valid(std::tuple<ParallelTensorShape, 17 | ParallelTensorShape, 18 | ParallelTensorShape> const &) const; 19 | }; 20 | 21 | bool operator==(MultiHeadAttentionParams const &, 22 | MultiHeadAttentionParams const &); 23 | 24 | } // namespace FlexFlow 25 | 26 | namespace std { 27 | template <> 28 | struct hash<FlexFlow::MultiHeadAttentionParams> { 29 | size_t operator()(FlexFlow::MultiHeadAttentionParams const &) const; 30 | }; 31 | } // namespace std 32 | 33 | #endif // _FLEXFLOW_ATTENTION_PARAMS_H 34 | -------------------------------------------------------------------------------- /include/flexflow/ops/batch_matmul_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "flexflow/parallel_tensor.h" 4 | 5 | namespace FlexFlow { 6 | 7 | struct BatchMatmulParams { 8 | int a_seq_length_dim, b_seq_length_dim; 9 | char name[MAX_OPNAME]; 10 | bool is_valid( 11 | std::pair<ParallelTensorShape, ParallelTensorShape> const &) const; 12 | }; 13 | 14 | bool operator==(BatchMatmulParams const &, BatchMatmulParams const &); 15 | 16 | } // namespace FlexFlow 17 | 18 | namespace std { 19 | template <> 20 | struct hash<FlexFlow::BatchMatmulParams> { 21 | size_t operator()(FlexFlow::BatchMatmulParams const &) const; 22 | }; 23 | } // namespace std 24 | -------------------------------------------------------------------------------- /include/flexflow/ops/beam_topk_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_BEAM_TOPK_PARAMS_H 2 | #define _FLEXFLOW_BEAM_TOPK_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct BeamTopKParams { 11 | LayerID layer_guid; 12 | bool sorted; 13 | int max_beam_width; 14 | char name[MAX_OPNAME]; 15 | bool is_valid(ParallelTensorShape const &) const; 16 | }; 17 | bool operator==(BeamTopKParams const &,
BeamTopKParams const &); 18 | 19 | } // namespace FlexFlow 20 | 21 | namespace std { 22 | template <> 23 | struct hash<FlexFlow::BeamTopKParams> { 24 | size_t operator()(FlexFlow::BeamTopKParams const &) const; 25 | }; 26 | } // namespace std 27 | 28 | #endif // _FLEXFLOW_BEAM_TOPK_PARAMS_H 29 | -------------------------------------------------------------------------------- /include/flexflow/ops/cast_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_CAST_PARAMS_H 2 | #define _FLEXFLOW_CAST_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct CastParams { 10 | DataType dtype; 11 | char name[MAX_OPNAME]; 12 | bool is_valid(ParallelTensorShape const &) const; 13 | }; 14 | bool operator==(CastParams const &, CastParams const &); 15 | 16 | } // namespace FlexFlow 17 | 18 | namespace std { 19 | template <> 20 | struct hash<FlexFlow::CastParams> { 21 | size_t operator()(FlexFlow::CastParams const &) const; 22 | }; 23 | } // namespace std 24 | 25 | #endif // _FLEXFLOW_CAST_PARAMS_H 26 | -------------------------------------------------------------------------------- /include/flexflow/ops/concat_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_CONCAT_PARAMS_H 2 | #define _FLEXFLOW_CONCAT_PARAMS_H 3 | 4 | #include "flexflow/parallel_tensor.h" 5 | 6 | namespace FlexFlow { 7 | 8 | struct ConcatParams { 9 | int axis; 10 | char name[MAX_OPNAME]; 11 | bool is_valid(std::vector<ParallelTensorShape> const &) const; 12 | }; 13 | 14 | bool operator==(ConcatParams const &, ConcatParams const &); 15 | 16 | } // namespace FlexFlow 17 | 18 | namespace std { 19 | template <> 20 | struct hash<FlexFlow::ConcatParams> { 21 | size_t operator()(FlexFlow::ConcatParams const &) const; 22 | }; 23 | } // namespace std 24 | 25 | #endif // _FLEXFLOW_CONCAT_PARAMS_H 26 | -------------------------------------------------------------------------------- /include/flexflow/ops/conv_2d_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_CONV_2D_PARAMS_H 2 | #define _FLEXFLOW_CONV_2D_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct Conv2DParams { 11 | LayerID layer_guid; 12 | int out_channels, kernel_h, kernel_w, stride_h, stride_w, padding_h, 13 | padding_w, groups; 14 | ActiMode activation; 15 | bool use_bias; 16 | char name[MAX_OPNAME]; 17 | 18 | bool is_valid(ParallelTensorShape const &input) const; 19 | void solve_dims(ParallelTensorShape const &input, 20 | ParallelDim output_dims[MAX_TENSOR_DIM], 21 | int *output_ndims, 22 | ParallelDim kernel_dims[MAX_TENSOR_DIM], 23 | int *kernel_ndims, 24 | ParallelDim bias_dims[MAX_TENSOR_DIM], 25 | int *bias_ndims) const; 26 | 27 | friend bool operator==(Conv2DParams const &lhs, Conv2DParams const &rhs); 28 | 29 | private: 30 | void mark_replica_dims(ParallelTensorShape const &input, 31 | ParallelDim output_dims[MAX_TENSOR_DIM], 32 | ParallelDim kernel_dims[MAX_TENSOR_DIM], 33 | ParallelDim bias_dims[MAX_TENSOR_DIM]) const; 34 | int output_size(ParallelTensorShape const &input, 35 | ParallelDim output_dims[MAX_TENSOR_DIM]) const; 36 | int kernel_size(ParallelTensorShape const &input_shape, 37 | ParallelDim kernel_dims[MAX_TENSOR_DIM]) const; 38 | int bias_size(ParallelTensorShape const &input, 39 | ParallelDim bias_dims[MAX_TENSOR_DIM]) const; 40 | }; 41 | 42 | } // namespace FlexFlow 43 | 44 |
namespace std { 45 | template <> 46 | struct hash<FlexFlow::Conv2DParams> { 47 | size_t operator()(FlexFlow::Conv2DParams const &) const; 48 | }; 49 | } // namespace std 50 | 51 | #endif // _FLEXFLOW_CONV_2D_PARAMS_H 52 | -------------------------------------------------------------------------------- /include/flexflow/ops/dropout_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_DROPOUT_PARAMS_H 2 | #define _FLEXFLOW_DROPOUT_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct DropoutParams { 10 | float rate; 11 | unsigned long long seed; 12 | char name[MAX_OPNAME]; 13 | bool is_valid(ParallelTensorShape const &) const; 14 | }; 15 | bool operator==(DropoutParams const &, DropoutParams const &); 16 | 17 | } // namespace FlexFlow 18 | 19 | namespace std { 20 | template <> 21 | struct hash<FlexFlow::DropoutParams> { 22 | size_t operator()(FlexFlow::DropoutParams const &) const; 23 | }; 24 | } // namespace std 25 | 26 | #endif // _FLEXFLOW_DROPOUT_PARAMS_H 27 | -------------------------------------------------------------------------------- /include/flexflow/ops/element_binary_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_ELEMENT_BINARY_PARAMS_H 2 | #define _FLEXFLOW_ELEMENT_BINARY_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct ElementBinaryParams { 11 | LayerID layer_guid; 12 | OperatorType type; 13 | bool inplace_a; 14 | char name[MAX_OPNAME]; 15 | 16 | bool is_valid( 17 | std::pair<ParallelTensorShape, ParallelTensorShape> const &) const; 18 | }; 19 | 20 | bool operator==(ElementBinaryParams const &, ElementBinaryParams const &); 21 | 22 | } // namespace FlexFlow 23 | 24 | namespace std { 25 | template <> 26 | struct hash<FlexFlow::ElementBinaryParams> { 27 | size_t operator()(FlexFlow::ElementBinaryParams const &) const; 28 | }; 29 | } // namespace std 30 | 31 | #endif // _FLEXFLOW_ELEMENT_BINARY_PARAMS_H 32 | -------------------------------------------------------------------------------- /include/flexflow/ops/element_unary_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_ELEMENTARY_UNARY_PARAMS_H 2 | #define _FLEXFLOW_ELEMENTARY_UNARY_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct ElementUnaryParams { 11 | OperatorType op_type; 12 | bool inplace; 13 | float scalar = 0.0; 14 | LayerID layer_guid; 15 | char name[MAX_OPNAME]; 16 | 17 | bool is_valid(ParallelTensorShape const &) const; 18 | }; 19 | 20 | bool operator==(ElementUnaryParams const &, ElementUnaryParams const &); 21 | 22 | } // namespace FlexFlow 23 | 24 | namespace std { 25 | template <> 26 | struct hash<FlexFlow::ElementUnaryParams> { 27 | size_t operator()(FlexFlow::ElementUnaryParams const &) const; 28 | }; 29 | } // namespace std 30 | 31 | #endif // _FLEXFLOW_ELEMENTARY_UNARY_PARAMS_H 32 | -------------------------------------------------------------------------------- /include/flexflow/ops/embedding_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_EMBEDDING_PARAMS_H 2 | #define _FLEXFLOW_EMBEDDING_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct EmbeddingParams { 11 | int num_entries,
out_channels; 12 | LayerID layer_guid; 13 | AggrMode aggr; 14 | DataType data_type; 15 | char name[MAX_OPNAME]; 16 | 17 | bool is_valid(ParallelTensorShape const &) const; 18 | }; 19 | bool operator==(EmbeddingParams const &, EmbeddingParams const &); 20 | 21 | } // namespace FlexFlow 22 | 23 | namespace std { 24 | template <> 25 | struct hash<FlexFlow::EmbeddingParams> { 26 | size_t operator()(FlexFlow::EmbeddingParams const &) const; 27 | }; 28 | } // namespace std 29 | 30 | #endif // _FLEXFLOW_EMBEDDING_PARAMS_H 31 | -------------------------------------------------------------------------------- /include/flexflow/ops/experts_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "flexflow/ffconst.h" 4 | #include "flexflow/fftype.h" 5 | #include "flexflow/operator.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct ExpertsParams { 11 | LayerID layer_guid; 12 | int num_experts; 13 | int experts_start_idx; 14 | int experts_output_dim_size; 15 | float alpha; 16 | int experts_num_layers; 17 | int experts_internal_dim_size; 18 | bool use_bias; 19 | ActiMode activation; 20 | char name[MAX_OPNAME]; 21 | 22 | bool is_valid(std::vector<ParallelTensorShape> const &) const; 23 | }; 24 | 25 | bool operator==(ExpertsParams const &, ExpertsParams const &); 26 | 27 | } // namespace FlexFlow 28 | 29 | namespace std { 30 | template <> 31 | struct hash<FlexFlow::ExpertsParams> { 32 | size_t operator()(FlexFlow::ExpertsParams const &) const; 33 | }; 34 | } // namespace std 35 | -------------------------------------------------------------------------------- /include/flexflow/ops/flat_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_FLAT_PARAMS_H 2 | #define _FLEXFLOW_FLAT_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct FlatParams { 10 | char name[MAX_OPNAME]; 11 | bool is_valid(ParallelTensorShape const &) const; 12 | void solve_dims(ParallelTensorShape const &input, 13 | ParallelDim output_dims[MAX_TENSOR_DIM], 14 | int *output_ndims) const; 15 | 16 | private: 17 | int output_size(ParallelTensorShape const &input, 18 | ParallelDim output_dims[MAX_TENSOR_DIM]) const; 19 | }; 20 | 21 | bool operator==(FlatParams const &, FlatParams const &); 22 | 23 | } // namespace FlexFlow 24 | 25 | namespace std { 26 | template <> 27 | struct hash<FlexFlow::FlatParams> { 28 | size_t operator()(FlexFlow::FlatParams const &) const; 29 | }; 30 | } // namespace std 31 | 32 | #endif // _FLEXFLOW_FLAT_PARAMS_H 33 | -------------------------------------------------------------------------------- /include/flexflow/ops/gather_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_GATHER_PARAMS_H 2 | #define _FLEXFLOW_GATHER_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct GatherParams { 11 | int legion_dim; 12 | LayerID layer_guid; 13 | char name[MAX_OPNAME]; 14 | bool is_valid( 15 | std::pair<ParallelTensorShape, ParallelTensorShape> const &input) const; 16 | }; 17 | 18 | bool operator==(GatherParams const &, GatherParams const &); 19 | 20 | } // namespace FlexFlow 21 | 22 | namespace std { 23 | template <> 24 | struct hash<FlexFlow::GatherParams> { 25 | size_t operator()(FlexFlow::GatherParams const &) const; 26 | }; 27 | } // namespace std 28 | 29 | #endif // _FLEXFLOW_GATHER_PARAMS_H 30 |
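Note: every *_params.h header in this include tree follows the same three-part pattern: a plain struct of operator parameters, an operator== for equality, and a std::hash specialization in namespace std so the parameters can key an unordered container (for example, to look up an already-created operator with identical parameters). Below is a minimal, self-contained sketch of that pattern; ExampleParams, its fields, and the boost-style hash-combine constant are illustrative stand-ins, not actual FlexFlow code.

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

namespace FlexFlow {

// Hypothetical params struct, mirroring the shape of e.g. ArgTopKParams.
struct ExampleParams {
  int k;
  bool sorted;
};

bool operator==(ExampleParams const &lhs, ExampleParams const &rhs) {
  return lhs.k == rhs.k && lhs.sorted == rhs.sorted;
}

} // namespace FlexFlow

namespace std {
template <>
struct hash<FlexFlow::ExampleParams> {
  size_t operator()(FlexFlow::ExampleParams const &p) const {
    // Combine member hashes; the magic constant is the usual boost-style mixer.
    size_t h = hash<int>{}(p.k);
    h ^= hash<bool>{}(p.sorted) + 0x9e3779b9 + (h << 6) + (h >> 2);
    return h;
  }
};
} // namespace std

int main() {
  // With operator== and std::hash defined, the params struct can key a cache
  // that maps identical parameters to a previously created operator.
  std::unordered_map<FlexFlow::ExampleParams, std::string> operator_cache;
  operator_cache[{5, true}] = "arg_topk(k=5, sorted=true)";
  std::cout << operator_cache[{5, true}] << std::endl;
  return 0;
}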
-------------------------------------------------------------------------------- /include/flexflow/ops/groupby_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_GROUPBY_PARAMS_H 2 | #define _FLEXFLOW_GROUPBY_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct Group_byParams { 10 | int n; 11 | float alpha; 12 | char name[MAX_OPNAME]; 13 | bool is_valid( 14 | std::pair<ParallelTensorShape, ParallelTensorShape> const &) const; 15 | }; 16 | bool operator==(Group_byParams const &, Group_byParams const &); 17 | 18 | } // namespace FlexFlow 19 | 20 | namespace std { 21 | template <> 22 | struct hash<FlexFlow::Group_byParams> { 23 | size_t operator()(FlexFlow::Group_byParams const &) const; 24 | }; 25 | } // namespace std 26 | 27 | #endif // _FLEXFLOW_GROUPBY_PARAMS_H 28 | -------------------------------------------------------------------------------- /include/flexflow/ops/inc_multihead_self_attention_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H 2 | #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/inference.h" 7 | #include "flexflow/parallel_tensor.h" 8 | 9 | namespace FlexFlow { 10 | 11 | struct IncMultiHeadSelfAttentionParams { 12 | LayerID layer_guid; 13 | int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, 14 | tensor_parallelism_degree, num_kv_cache_pages; 15 | float dropout, scaling_factor; 16 | bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; 17 | RotaryEmbeddingMeta rotary_embedding_meta; 18 | DataType quantization_type; 19 | bool offload; 20 | char name[MAX_OPNAME]; 21 | bool is_valid(ParallelTensorShape const &) const; 22 | }; 23 | 24 | bool operator==(IncMultiHeadSelfAttentionParams const &, 25 | IncMultiHeadSelfAttentionParams const &); 26 | 27 | } // namespace FlexFlow 28 | 29 | namespace std { 30 | template <> 31 | struct hash<FlexFlow::IncMultiHeadSelfAttentionParams> { 32 | size_t operator()(FlexFlow::IncMultiHeadSelfAttentionParams const &) const; 33 | }; 34 | } // namespace std 35 | 36 | #endif // _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H 37 | -------------------------------------------------------------------------------- /include/flexflow/ops/kernels/cast_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H 3 | 4 | #include "flexflow/device.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/op_meta.h" 7 | 8 | namespace FlexFlow { 9 | 10 | class Cast; 11 | 12 | class CastMeta : public OpMeta { 13 | public: 14 | CastMeta(FFHandler handle, Cast const *cast); 15 | DataType input_data_type, output_data_type; 16 | }; 17 | 18 | namespace Kernels { 19 | namespace Cast { 20 | template <typename IDT, typename ODT> 21 | void forward_kernel_wrapper(CastMeta const *m, 22 | IDT const *input_ptr, 23 | ODT *output_ptr, 24 | size_t volume); 25 | 26 | template <typename IDT, typename ODT> 27 | void backward_kernel_wrapper(IDT const *src_ptr, ODT *dst_ptr, size_t volume); 28 | 29 | namespace Internal { 30 | 31 | template <typename IDT, typename ODT> 32 | void forward_kernel(IDT const *input_ptr, 33 | ODT *output_ptr, 34 | size_t volume, 35 | ffStream_t stream); 36 | template <typename IDT, typename ODT> 37 | void backward_kernel(IDT const *src_ptr, 38 | ODT *dst_ptr, 39 | size_t volume, 40 | ffStream_t stream); 41 | } // namespace Internal 42 | } // namespace Cast 43 | } // namespace Kernels 44 | } //
namespace FlexFlow 45 | 46 | #endif // _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H 47 | -------------------------------------------------------------------------------- /include/flexflow/ops/kernels/concat_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H 3 | 4 | #include "flexflow/accessor.h" 5 | #include "flexflow/device.h" 6 | #include "flexflow/fftype.h" 7 | #include "flexflow/op_meta.h" 8 | 9 | namespace FlexFlow { 10 | 11 | class Concat; 12 | 13 | class ConcatMeta : public OpMeta { 14 | public: 15 | ConcatMeta(FFHandler handle, Concat const *cc); 16 | int legion_axis; 17 | }; 18 | 19 | namespace Kernels { 20 | namespace Concat { 21 | 22 | void init_meta(ConcatMeta *meta, int legion_axis); 23 | void forward_kernel_wrapper(ConcatMeta const *m, 24 | GenericTensorAccessorW const &output, 25 | GenericTensorAccessorR const *inputs, 26 | int num_inputs, 27 | int axis); 28 | void backward_kernel_wrapper(ConcatMeta const *m, 29 | GenericTensorAccessorR const &output_grad, 30 | GenericTensorAccessorW const *input_grads, 31 | int num_inputs, 32 | int axis); 33 | 34 | namespace Internal { 35 | 36 | void forward_kernel(GenericTensorAccessorW const &output, 37 | GenericTensorAccessorR const *inputs, 38 | int num_inputs, 39 | int axis, 40 | ffStream_t stream); 41 | 42 | void backward_kernel(GenericTensorAccessorR const &output_grad, 43 | GenericTensorAccessorW const *input_grads, 44 | int num_inputs, 45 | int axis, 46 | ffStream_t stream); 47 | } // namespace Internal 48 | } // namespace Concat 49 | } // namespace Kernels 50 | } // namespace FlexFlow 51 | 52 | #endif // _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H 53 | -------------------------------------------------------------------------------- /include/flexflow/ops/kernels/decompress_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_DECOMPRESS_KERNELS_H 2 | #define _FLEXFLOW_DECOMPRESS_KERNELS_H 3 | 4 | #include "flexflow/device.h" 5 | 6 | namespace FlexFlow { 7 | namespace Kernels { 8 | 9 | template <typename DT> 10 | __global__ void decompress_int4_general_weights(char const *input_weight_ptr, 11 | DT *weight_ptr, 12 | int in_dim, 13 | int valueSize); 14 | template <typename DT> 15 | __global__ void decompress_int8_general_weights(char const *input_weight_ptr, 16 | DT *weight_ptr, 17 | int in_dim, 18 | int valueSize); 19 | 20 | template <typename DT> 21 | __global__ void decompress_int4_attention_weights(char *input_weight_ptr, 22 | DT *weight_ptr, 23 | int qProjSize, 24 | int qSize, 25 | int num_heads); 26 | 27 | template <typename DT> 28 | __global__ void decompress_int8_attention_weights(char *input_weight_ptr, 29 | DT *weight_ptr, 30 | int qProjSize, 31 | int qSize, 32 | int num_heads); 33 | // template <typename T1, typename T2> 34 | // void decompress_weight_bias(T1 *input_weight_ptr, 35 | // T2 *weight_ptr, 36 | // T2 *params, 37 | // int group_size, 38 | // int tensor_size); 39 | 40 | } // namespace Kernels 41 | } // namespace FlexFlow 42 | 43 | #endif // _FLEXFLOW_DECOMPRESS_KERNELS_H 44 | -------------------------------------------------------------------------------- /include/flexflow/ops/kernels/dropout_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H 3 | 4 | #include "flexflow/device.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/op_meta.h" 7 | #include
"flexflow/ops/dropout.h" 8 | 9 | namespace FlexFlow { 10 | 11 | class DropoutMeta : public OpMeta { 12 | public: 13 | DropoutMeta(FFHandler handle, 14 | Dropout const *dropout, 15 | Legion::Memory gpu_mem, 16 | Legion::Domain const &output_domain); 17 | ~DropoutMeta(void); 18 | Realm::RegionInstance reserveInst; 19 | #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) 20 | cudnnTensorDescriptor_t inputTensor, outputTensor; 21 | cudnnDropoutDescriptor_t dropoutDesc; 22 | #else 23 | miopenTensorDescriptor_t inputTensor, outputTensor; 24 | miopenDropoutDescriptor_t dropoutDesc; 25 | #endif 26 | void *reserveSpace, *dropoutStates; 27 | size_t reserveSpaceSize, dropoutStateSize; 28 | }; 29 | 30 | namespace Kernels { 31 | namespace Dropout { 32 | void forward_kernel_wrapper(DropoutMeta *m, 33 | float const *input_ptr, 34 | float *output_ptr); 35 | void backward_kernel_wrapper(DropoutMeta *m, 36 | float const *output_grad_ptr, 37 | float *input_grad_ptr); 38 | 39 | namespace Internal { 40 | void forward_kernel(DropoutMeta *m, 41 | float const *input_ptr, 42 | float *output_ptr, 43 | ffStream_t stream); 44 | void backward_kernel(DropoutMeta *m, 45 | float const *output_grad_ptr, 46 | float *input_grad_ptr, 47 | ffStream_t stream); 48 | } // namespace Internal 49 | } // namespace Dropout 50 | } // namespace Kernels 51 | } // namespace FlexFlow 52 | 53 | #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H 54 | -------------------------------------------------------------------------------- /include/flexflow/ops/kernels/embedding_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H 3 | 4 | #include "flexflow/accessor.h" 5 | #include "flexflow/device.h" 6 | #include "flexflow/fftype.h" 7 | #include "flexflow/op_meta.h" 8 | 9 | namespace FlexFlow { 10 | 11 | class EmbeddingMeta : public OpMeta { 12 | public: 13 | EmbeddingMeta(FFHandler handle, Op const *op); 14 | DataType input_data_type; 15 | AggrMode aggr; 16 | }; 17 | 18 | namespace Kernels { 19 | namespace Embedding { 20 | void forward_kernel_wrapper(EmbeddingMeta const *m, 21 | GenericTensorAccessorR const &input, 22 | GenericTensorAccessorW const &output, 23 | GenericTensorAccessorR const &weight, 24 | int in_dim, 25 | int out_dim, 26 | int batch_size); 27 | void backward_kernel_wrapper(EmbeddingMeta const *m, 28 | GenericTensorAccessorR const &input, 29 | GenericTensorAccessorR const &output, 30 | GenericTensorAccessorW const &weight_grad, 31 | int in_dim, 32 | int out_dim, 33 | int batch_size); 34 | 35 | namespace Internal { 36 | template 37 | void forward_kernel(TI const *input_ptr, 38 | TD *output_ptr, 39 | TD const *weight_ptr, 40 | int in_dim, 41 | int out_dim, 42 | int batch_size, 43 | AggrMode aggr, 44 | int outputSize, 45 | ffStream_t stream); 46 | 47 | ; 48 | } // namespace Internal 49 | } // namespace Embedding 50 | } // namespace Kernels 51 | } // namespace FlexFlow 52 | 53 | #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H -------------------------------------------------------------------------------- /include/flexflow/ops/kernels/flat_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H 3 | 4 | #include "flexflow/device.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/op_meta.h" 7 | 8 | namespace FlexFlow { 9 | 10 | class Flat; 11 | 12 | 
class FlatMeta : public OpMeta { 13 | public: 14 | FlatMeta(FFHandler handle, Flat const *flat); 15 | }; 16 | 17 | namespace Kernels { 18 | namespace Flat { 19 | 20 | void forward_kernel_wrapper(float const *input_ptr, 21 | float *output_ptr, 22 | size_t num_elements); 23 | void backward_kernel_wrapper(float *input_grad_ptr, 24 | float const *output_grad_ptr, 25 | size_t num_elements); 26 | 27 | namespace Internal { 28 | 29 | void forward_kernel(float const *input_ptr, 30 | float *output_ptr, 31 | size_t num_elements, 32 | ffStream_t stream); 33 | void backward_kernel(float *input_grad_ptr, 34 | float const *output_grad_ptr, 35 | size_t num_elements, 36 | ffStream_t stream); 37 | 38 | } // namespace Internal 39 | } // namespace Flat 40 | } // namespace Kernels 41 | } // namespace FlexFlow 42 | 43 | #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H 44 | -------------------------------------------------------------------------------- /include/flexflow/ops/kernels/gather_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_GATHER_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_GATHER_KERNELS_H 3 | 4 | #include "flexflow/accessor.h" 5 | #include "flexflow/device.h" 6 | #include "flexflow/fftype.h" 7 | #include "flexflow/op_meta.h" 8 | 9 | namespace FlexFlow { 10 | 11 | class Gather; 12 | 13 | class GatherMeta : public OpMeta { 14 | public: 15 | GatherMeta(FFHandler handler, Gather const *gather); 16 | 17 | public: 18 | int legion_dim; 19 | }; 20 | 21 | namespace Kernels { 22 | namespace Gather { 23 | void forward_kernel_wrapper(GatherMeta const *m, 24 | GenericTensorAccessorR const &input, 25 | GenericTensorAccessorR const &index, 26 | GenericTensorAccessorW const &output); 27 | void backward_kernel_wrapper(GatherMeta const *m, 28 | GenericTensorAccessorR const &output_grad, 29 | GenericTensorAccessorR const &index, 30 | GenericTensorAccessorW const &input_grad); 31 | namespace Internal { 32 | template <typename IndexType> 33 | void forward_kernel(float const *input_ptr, 34 | IndexType const *index_ptr, 35 | float *output_ptr, 36 | Legion::coord_t output_size, 37 | Legion::coord_t stride, 38 | Legion::coord_t input_dim_size, 39 | Legion::coord_t output_dim_size, 40 | ffStream_t stream); 41 | template <typename IndexType> 42 | void backward_kernel(float const *output_grad_ptr, 43 | IndexType const *index_ptr, 44 | float *input_grad_ptr, 45 | Legion::coord_t output_size, 46 | Legion::coord_t stride, 47 | Legion::coord_t input_dim_size, 48 | Legion::coord_t output_dim_size, 49 | ffStream_t stream); 50 | } // namespace Internal 51 | } // namespace Gather 52 | } // namespace Kernels 53 | } // namespace FlexFlow 54 | 55 | #endif // _FLEXFLOW_OPS_KERNELS_GATHER_KERNELS_H 56 | -------------------------------------------------------------------------------- /include/flexflow/ops/kernels/reshape_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H 3 | 4 | #include "flexflow/device.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/op_meta.h" 7 | 8 | namespace FlexFlow { 9 | 10 | class Reshape; 11 | 12 | class ReshapeMeta : public OpMeta { 13 | public: 14 | ReshapeMeta(FFHandler handler, Reshape const *reshape); 15 | DataType data_type; 16 | }; 17 | 18 | namespace Kernels { 19 | namespace Reshape { 20 | 21 | template <typename T> 22 | void forward_kernel_wrapper(T const *input_ptr, 23 | T *output_ptr, 24 | size_t num_elements); 25 | 26 |
template <typename T> 27 | void backward_kernel_wrapper(T *input_grad_ptr, 28 | T const *output_grad_ptr, 29 | size_t num_elements); 30 | 31 | namespace Internal { 32 | 33 | template <typename T> 34 | void forward_kernel(T const *input_ptr, 35 | T *output_ptr, 36 | size_t num_elements, 37 | ffStream_t stream); 38 | template <typename T> 39 | void backward_kernel(T *input_grad_ptr, 40 | T const *output_grad_ptr, 41 | size_t num_elements, 42 | ffStream_t stream); 43 | 44 | } // namespace Internal 45 | } // namespace Reshape 46 | } // namespace Kernels 47 | } // namespace FlexFlow 48 | 49 | #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H 50 | -------------------------------------------------------------------------------- /include/flexflow/ops/kernels/split_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H 3 | 4 | #include "flexflow/device.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/op_meta.h" 7 | 8 | namespace FlexFlow { 9 | 10 | namespace Kernels { 11 | namespace Split { 12 | void forward_kernel_wrapper(float **out_ptrs, 13 | float const *in_ptr, 14 | Legion::coord_t const *out_blk_sizes, 15 | Legion::coord_t in_blk_size, 16 | Legion::coord_t num_blks, 17 | int numOutputs); 18 | 19 | void backward_kernel_wrapper(float *in_grad_ptr, 20 | float const **out_grad_ptr, 21 | Legion::coord_t const *out_blk_sizes, 22 | Legion::coord_t in_blk_size, 23 | Legion::coord_t num_blks, 24 | int numOutputs); 25 | 26 | namespace Internal { 27 | void forward_kernel(float **out_ptrs, 28 | float const *in_ptr, 29 | Legion::coord_t const *out_blk_sizes, 30 | Legion::coord_t in_blk_size, 31 | Legion::coord_t num_blks, 32 | int numOutputs, 33 | ffStream_t stream); 34 | void backward_kernel(float *in_grad_ptr, 35 | float const **out_grad_ptr, 36 | Legion::coord_t const *out_blk_sizes, 37 | Legion::coord_t in_blk_size, 38 | Legion::coord_t num_blks, 39 | int numOutputs, 40 | ffStream_t stream); 41 | } // namespace Internal 42 | } // namespace Split 43 | } // namespace Kernels 44 | } // namespace FlexFlow 45 | 46 | #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H 47 | -------------------------------------------------------------------------------- /include/flexflow/ops/kernels/transpose_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H 3 | 4 | #include "flexflow/device.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/op_meta.h" 7 | 8 | namespace FlexFlow { 9 | 10 | class Transpose; 11 | 12 | class TransposeMeta : public OpMeta { 13 | public: 14 | TransposeMeta(FFHandler handler, Transpose const *transpose); 15 | int num_dim; 16 | int perm[MAX_TENSOR_DIM]; 17 | }; 18 | 19 | namespace Kernels { 20 | namespace Transpose { 21 | 22 | void forward_kernel_wrapper(TransposeMeta const *m, 23 | float const *input_ptr, 24 | float *output_ptr, 25 | Legion::Domain in_domain, 26 | Legion::Domain out_domain); 27 | void backward_kernel_wrapper(TransposeMeta const *m, 28 | float *input_grad_ptr, 29 | float const *output_grad_ptr, 30 | Legion::Domain in_grad_domain, 31 | Legion::Domain out_grad_domain); 32 | 33 | namespace Internal { 34 | 35 | void forward_kernel(TransposeMeta const *m, 36 | float const *input_ptr, 37 | float *output_ptr, 38 | Legion::Domain in_domain, 39 | Legion::Domain out_domain, 40 | ffStream_t stream); 41 | void
backward_kernel(TransposeMeta const *m, 42 | float *input_grad_ptr, 43 | float const *output_grad_ptr, 44 | Legion::Domain in_grad_domain, 45 | Legion::Domain out_grad_domain, 46 | ffStream_t stream); 47 | 48 | } // namespace Internal 49 | } // namespace Transpose 50 | } // namespace Kernels 51 | } // namespace FlexFlow 52 | 53 | #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H 54 | -------------------------------------------------------------------------------- /include/flexflow/ops/layer_norm_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "flexflow/ffconst.h" 4 | #include "flexflow/fftype.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct LayerNormParams { 10 | LayerID layer_guid; 11 | std::vector<int> axes; 12 | bool elementwise_affine; 13 | float eps; 14 | bool use_bias; 15 | char name[MAX_OPNAME]; 16 | bool is_valid(ParallelTensorShape const &) const; 17 | }; 18 | 19 | bool operator==(LayerNormParams const &, LayerNormParams const &); 20 | 21 | } // namespace FlexFlow 22 | 23 | namespace std { 24 | template <> 25 | struct hash<FlexFlow::LayerNormParams> { 26 | size_t operator()(FlexFlow::LayerNormParams const &) const; 27 | }; 28 | } // namespace std 29 | -------------------------------------------------------------------------------- /include/flexflow/ops/mean.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "flexflow/model.h" 4 | 5 | namespace FlexFlow { 6 | 7 | class Mean : public Op { 8 | public: 9 | Mean(FFModel &model, 10 | const ParallelTensor input, 11 | std::vector<int> const &dims, 12 | bool keepdims, 13 | char const *name); 14 | void init(FFModel const &) override; 15 | void forward(FFModel const &) override; 16 | void backward(FFModel const &) override; 17 | void print_layer(FFModel const &model) override { 18 | assert(0); 19 | } 20 | 21 | static OpMeta *init_task(Legion::Task const *task, 22 | std::vector<Legion::PhysicalRegion> const &regions, 23 | Legion::Context ctx, 24 | Legion::Runtime *runtime); 25 | static void forward_task(Legion::Task const *task, 26 | std::vector<Legion::PhysicalRegion> const &regions, 27 | Legion::Context ctx, 28 | Legion::Runtime *runtime); 29 | static void backward_task(Legion::Task const *task, 30 | std::vector<Legion::PhysicalRegion> const &regions, 31 | Legion::Context ctx, 32 | Legion::Runtime *runtime); 33 | bool measure_operator_cost(Simulator *sim, 34 | MachineView const &pc, 35 | CostMetrics &cost_metrics) const override; 36 | }; 37 | 38 | }; // namespace FlexFlow 39 | -------------------------------------------------------------------------------- /include/flexflow/ops/noop.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_NOOP_H 2 | #define _FLEXFLOW_NOOP_H 3 | 4 | #include "flexflow/inference.h" 5 | #include "flexflow/model.h" 6 | 7 | namespace FlexFlow { 8 | 9 | class NoOp : public Op { 10 | public: 11 | NoOp(FFModel &model, 12 | OperatorType type, 13 | const ParallelTensor output, 14 | char const *name = NULL); 15 | NoOp(FFModel &model, 16 | OperatorType type, 17 | size_t input_tensor_guid, 18 | const ParallelTensor output, 19 | char const *name = NULL); 20 | void init(FFModel const &) override; 21 | void init_inference(FFModel const &, 22 | std::vector<ParallelTensor> const &, 23 | std::vector<ParallelTensor> const &, 24 | MachineView const *mv = nullptr) override; 25 | void forward(FFModel const &) override; 26 | Legion::FutureMap inference(FFModel const &, 27 | BatchConfigFuture const &, 28 | std::vector<ParallelTensor> const &, 29 | std::vector<ParallelTensor>
const &, 30 | MachineView const *mv = nullptr) override; 31 | void backward(FFModel const &) override; 32 | void print_layer(FFModel const &model) override { 33 | assert(0); 34 | } 35 | bool measure_operator_cost(Simulator *sim, 36 | MachineView const &pc, 37 | CostMetrics &cost_metrics) const override; 38 | static OpMeta *init_task(Legion::Task const *task, 39 | std::vector<Legion::PhysicalRegion> const &regions, 40 | Legion::Context ctx, 41 | Legion::Runtime *runtime); 42 | 43 | size_t get_params_hash() const override; 44 | tl::optional<RecordFormatter> as_dot() const override; 45 | 46 | public: 47 | size_t input_tensor_guid; 48 | }; 49 | 50 | }; // namespace FlexFlow 51 | 52 | #endif // _FLEXFLOW_NOOP_H 53 | -------------------------------------------------------------------------------- /include/flexflow/ops/pool_2d_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_POOL_2D_PARAMS_H 2 | #define _FLEXFLOW_POOL_2D_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct Pool2DParams { 10 | int kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w; 11 | PoolType pool_type; 12 | ActiMode activation; 13 | char name[MAX_OPNAME]; 14 | 15 | bool is_valid(ParallelTensorShape const &input) const; 16 | void solve_dims(ParallelTensorShape const &input, 17 | ParallelDim output_dims[MAX_TENSOR_DIM], 18 | int *output_ndims) const; 19 | 20 | private: 21 | int output_size(ParallelTensorShape const &input, 22 | ParallelDim output_dims[MAX_TENSOR_DIM]) const; 23 | }; 24 | 25 | bool operator==(Pool2DParams const &, Pool2DParams const &); 26 | 27 | } // namespace FlexFlow 28 | 29 | namespace std { 30 | template <> 31 | struct hash<FlexFlow::Pool2DParams> { 32 | size_t operator()(FlexFlow::Pool2DParams const &) const; 33 | }; 34 | } // namespace std 35 | 36 | #endif // _FLEXFLOW_POOL_2D_PARAMS_H 37 | -------------------------------------------------------------------------------- /include/flexflow/ops/reduce_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "flexflow/ffconst.h" 4 | #include "flexflow/fftype.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct ReduceParams { 10 | std::vector<int> axes; 11 | bool keepdims; 12 | LayerID layer_guid; 13 | char name[MAX_OPNAME]; 14 | 15 | bool is_valid(ParallelTensorShape const &) const; 16 | }; 17 | 18 | bool operator==(ReduceParams const &, ReduceParams const &); 19 | 20 | } // namespace FlexFlow 21 | 22 | namespace std { 23 | template <> 24 | struct hash<FlexFlow::ReduceParams> { 25 | size_t operator()(FlexFlow::ReduceParams const &) const; 26 | }; 27 | } // namespace std 28 | -------------------------------------------------------------------------------- /include/flexflow/ops/reshape_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_RESHAPE_PARAMS_H 2 | #define _FLEXFLOW_RESHAPE_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct ReshapeParams { 11 | std::vector<int> shape; 12 | LayerID layer_guid; 13 | char name[MAX_OPNAME]; 14 | 15 | bool is_valid(ParallelTensorShape const &) const; 16 | }; 17 | bool operator==(ReshapeParams const &, ReshapeParams const &); 18 | 19 | } // namespace FlexFlow 20 | 21 | namespace std { 22 | template <> 23 | struct hash<FlexFlow::ReshapeParams> { 24 | size_t operator()(FlexFlow::ReshapeParams const &) const; 25 | }; 26 | } //
namespace std 27 | 28 | #endif // _FLEXFLOW_RESHAPE_PARAMS_H 29 | -------------------------------------------------------------------------------- /include/flexflow/ops/residual_layer_norm_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "flexflow/ffconst.h" 4 | #include "flexflow/fftype.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct ResidualLayerNormParams { 10 | LayerID layer_guid; 11 | std::vector<int> axes; 12 | bool elementwise_affine; 13 | float eps; 14 | bool use_bias; 15 | bool use_two_residuals; 16 | bool inplace_residual; 17 | char name[MAX_OPNAME]; 18 | bool is_valid(std::tuple<ParallelTensorShape, 19 | ParallelTensorShape, 20 | ParallelTensorShape> const &) const; 21 | }; 22 | 23 | bool operator==(ResidualLayerNormParams const &, 24 | ResidualLayerNormParams const &); 25 | 26 | } // namespace FlexFlow 27 | 28 | namespace std { 29 | template <> 30 | struct hash<FlexFlow::ResidualLayerNormParams> { 31 | size_t operator()(FlexFlow::ResidualLayerNormParams const &) const; 32 | }; 33 | } // namespace std 34 | -------------------------------------------------------------------------------- /include/flexflow/ops/residual_rms_norm_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_RESIDUAL_RMSNORM_PARAMS_H 2 | #define _FLEXFLOW_RESIDUAL_RMSNORM_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct ResidualRMSNormParams { 11 | LayerID layer_guid; 12 | float eps; 13 | int dim; 14 | bool inplace_residual; 15 | char name[MAX_OPNAME]; 16 | bool is_valid( 17 | std::pair<ParallelTensorShape, ParallelTensorShape> const &input) const; 18 | }; 19 | 20 | bool operator==(ResidualRMSNormParams const &, ResidualRMSNormParams const &); 21 | 22 | } // namespace FlexFlow 23 | 24 | namespace std { 25 | template <> 26 | struct hash<FlexFlow::ResidualRMSNormParams> { 27 | size_t operator()(FlexFlow::ResidualRMSNormParams const &) const; 28 | }; 29 | } // namespace std 30 | 31 | #endif // _FLEXFLOW_RESIDUAL_RMSNORM_PARAMS_H -------------------------------------------------------------------------------- /include/flexflow/ops/rms_norm_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_RMSNORM_PARAMS_H 2 | #define _FLEXFLOW_RMSNORM_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct RMSNormParams { 11 | LayerID layer_guid; 12 | float eps; 13 | int dim; 14 | char name[MAX_OPNAME]; 15 | bool is_valid(ParallelTensorShape const &) const; 16 | }; 17 | 18 | bool operator==(RMSNormParams const &, RMSNormParams const &); 19 | 20 | } // namespace FlexFlow 21 | 22 | namespace std { 23 | template <> 24 | struct hash<FlexFlow::RMSNormParams> { 25 | size_t operator()(FlexFlow::RMSNormParams const &) const; 26 | }; 27 | } // namespace std 28 | 29 | #endif // _FLEXFLOW_RMSNORM_PARAMS_H -------------------------------------------------------------------------------- /include/flexflow/ops/sampling_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_SAMPLING_PARAMS_H 2 | #define _FLEXFLOW_SAMPLING_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct SamplingParams { 10 | float top_p; 11 | char name[MAX_OPNAME]; 12 | bool is_valid(ParallelTensorShape const &) const; 13 | }; 14 | bool operator==(SamplingParams const &, SamplingParams
const &); 15 | 16 | } // namespace FlexFlow 17 | 18 | namespace std { 19 | template <> 20 | struct hash<FlexFlow::SamplingParams> { 21 | size_t operator()(FlexFlow::SamplingParams const &) const; 22 | }; 23 | } // namespace std 24 | 25 | #endif // _FLEXFLOW_SAMPLING_PARAMS_H -------------------------------------------------------------------------------- /include/flexflow/ops/sigmoid_silu_multi_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "flexflow/ffconst.h" 4 | #include "flexflow/fftype.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct SigmoidSiluMultiParams { 10 | LayerID layer_guid; 11 | char name[MAX_OPNAME]; 12 | bool is_valid( 13 | std::pair<ParallelTensorShape, ParallelTensorShape> const &) const; 14 | }; 15 | 16 | bool operator==(SigmoidSiluMultiParams const &, SigmoidSiluMultiParams const &); 17 | 18 | } // namespace FlexFlow 19 | 20 | namespace std { 21 | template <> 22 | struct hash<FlexFlow::SigmoidSiluMultiParams> { 23 | size_t operator()(FlexFlow::SigmoidSiluMultiParams const &) const; 24 | }; 25 | } // namespace std 26 | -------------------------------------------------------------------------------- /include/flexflow/ops/softmax_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_SOFTMAX_PARAMS_H 2 | #define _FLEXFLOW_SOFTMAX_PARAMS_H 3 | 4 | #include "flexflow/parallel_tensor.h" 5 | 6 | namespace FlexFlow { 7 | 8 | struct SoftmaxParams { 9 | LayerID layer_guid; 10 | int dim; 11 | char name[MAX_OPNAME]; 12 | bool is_valid(ParallelTensorShape const &) const; 13 | }; 14 | bool operator==(SoftmaxParams const &, SoftmaxParams const &); 15 | 16 | } // namespace FlexFlow 17 | 18 | namespace std { 19 | template <> 20 | struct hash<FlexFlow::SoftmaxParams> { 21 | size_t operator()(FlexFlow::SoftmaxParams const &) const; 22 | }; 23 | } // namespace std 24 | 25 | #endif // _FLEXFLOW_SOFTMAX_PARAMS_H 26 | -------------------------------------------------------------------------------- /include/flexflow/ops/spec_inc_multihead_self_attention_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H 2 | #define _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct SpecIncMultiHeadSelfAttentionParams { 11 | LayerID layer_guid; 12 | int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; 13 | float dropout, scaling_factor; 14 | bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; 15 | int num_kv_cache_pages; 16 | RotaryEmbeddingMeta rotary_embedding_meta; 17 | char name[MAX_OPNAME]; 18 | bool is_valid(ParallelTensorShape const &) const; 19 | }; 20 | 21 | bool operator==(SpecIncMultiHeadSelfAttentionParams const &, 22 | SpecIncMultiHeadSelfAttentionParams const &); 23 | 24 | } // namespace FlexFlow 25 | 26 | namespace std { 27 | template <> 28 | struct hash<FlexFlow::SpecIncMultiHeadSelfAttentionParams> { 29 | size_t 30 | operator()(FlexFlow::SpecIncMultiHeadSelfAttentionParams const &) const; 31 | }; 32 | } // namespace std 33 | 34 | #endif // _FLEXFLOW_SPEC_INC_MULTIHEAD_SELF_ATTENTION_PARAMS_H 35 | -------------------------------------------------------------------------------- /include/flexflow/ops/split_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_SPLIT_PARAMS_H 2 | #define _FLEXFLOW_SPLIT_PARAMS_H 3 | 4 | #include "flexflow/parallel_tensor.h" 5 | 6
| namespace FlexFlow { 7 | 8 | struct SplitParams { 9 | std::vector<int> splits; 10 | int legion_axis; 11 | char name[MAX_OPNAME]; 12 | bool is_valid(ParallelTensorShape const &) const; 13 | }; 14 | 15 | bool operator==(SplitParams const &, SplitParams const &); 16 | 17 | } // namespace FlexFlow 18 | 19 | namespace std { 20 | template <> 21 | struct hash<FlexFlow::SplitParams> { 22 | size_t operator()(FlexFlow::SplitParams const &) const; 23 | }; 24 | } // namespace std 25 | 26 | #endif // _FLEXFLOW_SPLIT_PARAMS_H 27 | -------------------------------------------------------------------------------- /include/flexflow/ops/topk_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_TOPK_PARAMS_H 2 | #define _FLEXFLOW_TOPK_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/parallel_tensor.h" 6 | 7 | namespace FlexFlow { 8 | 9 | struct TopKParams { 10 | int k; 11 | bool sorted; 12 | char name[MAX_OPNAME]; 13 | bool is_valid(ParallelTensorShape const &) const; 14 | }; 15 | bool operator==(TopKParams const &, TopKParams const &); 16 | 17 | } // namespace FlexFlow 18 | 19 | namespace std { 20 | template <> 21 | struct hash<FlexFlow::TopKParams> { 22 | size_t operator()(FlexFlow::TopKParams const &) const; 23 | }; 24 | } // namespace std 25 | 26 | #endif // _FLEXFLOW_TOPK_PARAMS_H 27 | -------------------------------------------------------------------------------- /include/flexflow/ops/transpose_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "flexflow/parallel_tensor.h" 4 | 5 | namespace FlexFlow { 6 | 7 | struct TransposeParams { 8 | std::vector<int> perm; 9 | char name[MAX_OPNAME]; 10 | bool is_valid(ParallelTensorShape const &) const; 11 | }; 12 | 13 | bool operator==(TransposeParams const &, TransposeParams const &); 14 | 15 | } // namespace FlexFlow 16 | 17 | namespace std { 18 | template <> 19 | struct hash<FlexFlow::TransposeParams> { 20 | size_t operator()(FlexFlow::TransposeParams const &) const; 21 | }; 22 | } // namespace std 23 | -------------------------------------------------------------------------------- /include/flexflow/ops/tree_inc_multihead_self_attention_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_PARAMS_H 2 | #define _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_PARAMS_H 3 | 4 | #include "flexflow/ffconst.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/parallel_tensor.h" 7 | 8 | namespace FlexFlow { 9 | 10 | struct TreeIncMultiHeadSelfAttentionParams { 11 | LayerID layer_guid; 12 | int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, 13 | tensor_parallelism_degree, num_kv_cache_pages; 14 | float dropout, scaling_factor; 15 | bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; 16 | RotaryEmbeddingMeta rotary_embedding_meta; 17 | DataType quantization_type; 18 | bool offload; 19 | char name[MAX_OPNAME]; 20 | bool is_valid(ParallelTensorShape const &) const; 21 | }; 22 | 23 | bool operator==(TreeIncMultiHeadSelfAttentionParams const &, 24 | TreeIncMultiHeadSelfAttentionParams const &); 25 | 26 | } // namespace FlexFlow 27 | 28 | namespace std { 29 | template <> 30 | struct hash<FlexFlow::TreeIncMultiHeadSelfAttentionParams> { 31 | size_t 32 | operator()(FlexFlow::TreeIncMultiHeadSelfAttentionParams const &) const; 33 | }; 34 | } // namespace std 35 | 36 | #endif // _FLEXFLOW_INC_MULTIHEAD_SELF_ATTENTION_VERIFY_PARAMS_H 37 | --------------------------------------------------------------------------------
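Note: the kernels headers in this tree share a common layering: a public *_kernel_wrapper entry point per op, with the stream-level work in an Internal::*_kernel that takes an explicit ffStream_t (fetched via get_legion_stream from flexflow/device.h). Below is a minimal CPU-only sketch of that layering; the Scale op, the stand-in ffStream_t typedef, and the trivial get_legion_stream are illustrative assumptions, not FlexFlow's real device code.

#include <cstddef>
#include <iostream>

// Stand-ins for ffStream_t / get_legion_stream from flexflow/device.h.
typedef int ffStream_t;
int get_legion_stream(ffStream_t *stream) {
  *stream = 0; // a real implementation returns the Legion-managed GPU stream
  return 0;
}

namespace Kernels {
namespace Scale {
namespace Internal {
// Internal kernel: takes an explicit stream and does the actual work
// (on the GPU in the real code; a plain loop here for illustration).
void forward_kernel(float const *input_ptr,
                    float *output_ptr,
                    size_t num_elements,
                    float factor,
                    ffStream_t /*stream*/) {
  for (size_t i = 0; i < num_elements; i++) {
    output_ptr[i] = input_ptr[i] * factor;
  }
}
} // namespace Internal

// Public wrapper: acquires the current stream, then forwards to Internal.
void forward_kernel_wrapper(float const *input_ptr,
                            float *output_ptr,
                            size_t num_elements,
                            float factor) {
  ffStream_t stream;
  get_legion_stream(&stream);
  Internal::forward_kernel(input_ptr, output_ptr, num_elements, factor, stream);
}
} // namespace Scale
} // namespace Kernels

int main() {
  float in[4] = {1, 2, 3, 4};
  float out[4];
  Kernels::Scale::forward_kernel_wrapper(in, out, 4, 2.0f);
  std::cout << out[0] << " " << out[3] << std::endl; // prints: 2 8
  return 0;
}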
/include/flexflow/parallel_ops/allreduce_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_ALLREDUCE_PARAMS_H 2 | #define _FLEXFLOW_ALLREDUCE_PARAMS_H 3 | 4 | namespace FlexFlow { 5 | 6 | struct AllReduceParams { 7 | LayerID layer_guid; 8 | int allreduce_legion_dim; 9 | char name[MAX_OPNAME]; 10 | bool is_valid(ParallelTensorShape const &) const; 11 | }; 12 | bool operator==(AllReduceParams const &, AllReduceParams const &); 13 | 14 | } // namespace FlexFlow 15 | 16 | namespace std { 17 | template <> 18 | struct hash<FlexFlow::AllReduceParams> { 19 | size_t operator()(FlexFlow::AllReduceParams const &) const; 20 | }; 21 | } // namespace std 22 | 23 | #endif // _FLEXFLOW_ALLREDUCE_PARAMS_H 24 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/combine_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_COMBINE_PARAMS_H 2 | #define _FLEXFLOW_COMBINE_PARAMS_H 3 | 4 | namespace FlexFlow { 5 | 6 | struct CombineParams { 7 | int combine_legion_dim; 8 | int combine_degree; 9 | char name[MAX_OPNAME]; 10 | bool is_valid(ParallelTensorShape const &) const; 11 | }; 12 | bool operator==(CombineParams const &, CombineParams const &); 13 | 14 | } // namespace FlexFlow 15 | 16 | namespace std { 17 | template <> 18 | struct hash<FlexFlow::CombineParams> { 19 | size_t operator()(FlexFlow::CombineParams const &) const; 20 | }; 21 | } // namespace std 22 | 23 | #endif // _FLEXFLOW_COMBINE_PARAMS_H 24 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/fused_parallel_op_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_FUSED_PARALLEL_OP_PARAMS_H 2 | #define _FLEXFLOW_FUSED_PARALLEL_OP_PARAMS_H 3 | 4 | #include "parallel_op_info.h" 5 | 6 | namespace FlexFlow { 7 | 8 | struct FusedParallelOpParams { 9 | std::vector<ParallelOpInfo> parallel_ops; 10 | char name[MAX_OPNAME]; 11 | bool is_valid(ParallelTensorShape const &) const; 12 | }; 13 | bool operator==(FusedParallelOpParams const &, FusedParallelOpParams const &); 14 | 15 | } // namespace FlexFlow 16 | 17 | namespace std { 18 | template <> 19 | struct hash<FlexFlow::FusedParallelOpParams> { 20 | size_t operator()(FlexFlow::FusedParallelOpParams const &) const; 21 | }; 22 | } // namespace std 23 | 24 | #endif // _FLEXFLOW_FUSED_PARALLEL_OP_PARAMS_H 25 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/kernels/allreduce_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H 3 | 4 | #include "flexflow/batch_config.h" 5 | #include "flexflow/device.h" 6 | #include "flexflow/fftype.h" 7 | #include "flexflow/op_meta.h" 8 | #include "flexflow/parallel_ops/allreduce.h" 9 | 10 | namespace FlexFlow { 11 | 12 | class AllReduceMeta : public OpMeta { 13 | public: 14 | AllReduceMeta(FFHandler handle, AllReduce const *reduct); 15 | }; 16 | 17 | namespace Kernels { 18 | namespace AllReduce { 19 | 20 | void forward_kernel_wrapper(AllReduceMeta const *m, 21 | GenericTensorAccessorR const &input, 22 | GenericTensorAccessorW const &output); 23 | 24 | void backward_kernel_wrapper(AllReduceMeta const *m, 25 | GenericTensorAccessorW const &input_grad, 26 | GenericTensorAccessorR const &output_grad); 27 | 28 | void inference_kernel_wrapper(AllReduceMeta const *m, 29 |
BatchConfig const *bc, 30 | GenericTensorAccessorR const &input, 31 | GenericTensorAccessorW const &output); 32 | 33 | void peft_bwd_kernel_wrapper(AllReduceMeta const *m, 34 | BatchConfig const *bc, 35 | GenericTensorAccessorW const &input_grad, 36 | GenericTensorAccessorR const &output_grad); 37 | } // namespace AllReduce 38 | } // namespace Kernels 39 | } // namespace FlexFlow 40 | 41 | #endif // _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H 42 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/kernels/combine_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H 3 | 4 | #include "flexflow/device.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/op_meta.h" 7 | #include "flexflow/parallel_ops/combine.h" 8 | 9 | namespace FlexFlow { 10 | 11 | class Combine; 12 | 13 | class CombineMeta : public OpMeta { 14 | public: 15 | CombineMeta(FFHandler handle, Combine const *comb); 16 | DataType data_type; 17 | }; 18 | 19 | namespace Kernels { 20 | namespace Combine { 21 | 22 | template 23 | void forward_kernel(T const *input_ptr, T *output_ptr, size_t num_elements); 24 | 25 | template 26 | void backward_kernel(T const *output_grad_ptr, 27 | T *input_grad_ptr, 28 | size_t num_elements); 29 | 30 | } // namespace Combine 31 | } // namespace Kernels 32 | } // namespace FlexFlow 33 | 34 | #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H 35 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H 3 | 4 | #include "flexflow/batch_config.h" 5 | #include "flexflow/device.h" 6 | #include "flexflow/fftype.h" 7 | #include "flexflow/op_meta.h" 8 | #include "flexflow/parallel_ops/parallel_identity.h" 9 | 10 | namespace FlexFlow { 11 | 12 | class ParallelIdentityMeta : public OpMeta { 13 | public: 14 | ParallelIdentityMeta(FFHandler handle, ParallelIdentity const *reduct); 15 | }; 16 | 17 | namespace Kernels { 18 | namespace ParallelIdentity { 19 | 20 | void forward_kernel_wrapper(ParallelIdentityMeta const *m, 21 | GenericTensorAccessorR const &input, 22 | GenericTensorAccessorW const &output); 23 | 24 | void backward_kernel_wrapper(ParallelIdentityMeta const *m, 25 | GenericTensorAccessorW const &input_grad, 26 | GenericTensorAccessorR const &output_grad); 27 | 28 | void inference_kernel_wrapper(ParallelIdentityMeta const *m, 29 | BatchConfig const *bc, 30 | GenericTensorAccessorR const &input, 31 | GenericTensorAccessorW const &output); 32 | 33 | void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, 34 | BatchConfig const *bc, 35 | GenericTensorAccessorW const &input_grad, 36 | GenericTensorAccessorR const &output_grad); 37 | } // namespace ParallelIdentity 38 | } // namespace Kernels 39 | } // namespace FlexFlow 40 | 41 | #endif // _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H 42 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/kernels/partition_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H 2 | #define 
_FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H 3 | 4 | #include "flexflow/device.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/op_meta.h" 7 | 8 | namespace FlexFlow { 9 | 10 | class Repartition; 11 | 12 | class RepartitionMeta : public OpMeta { 13 | public: 14 | RepartitionMeta(FFHandler handle, Repartition const *repart); 15 | DataType data_type; 16 | }; 17 | 18 | namespace Kernels { 19 | namespace Repartition { 20 | 21 | template 22 | void forward_kernel(T const *input_ptr, T *output_ptr, size_t num_elements); 23 | 24 | template 25 | void backward_kernel(T const *output_grad_ptr, 26 | T *input_grad_ptr, 27 | size_t num_elements); 28 | 29 | } // namespace Repartition 30 | } // namespace Kernels 31 | } // namespace FlexFlow 32 | 33 | #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H 34 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/kernels/reduction_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H 3 | 4 | #include "flexflow/device.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/op_meta.h" 7 | #include "flexflow/parallel_ops/reduction.h" 8 | 9 | namespace FlexFlow { 10 | 11 | class ReductionMeta : public OpMeta { 12 | public: 13 | ReductionMeta(FFHandler handle, Reduction const *reduct); 14 | }; 15 | 16 | namespace Kernels { 17 | namespace Reduction { 18 | 19 | template 20 | void forward_kernel(T const *input_ptr, 21 | T *output_ptr, 22 | size_t num_elements, 23 | size_t num_replicas); 24 | 25 | template 26 | void backward_kernel(T const *output_grad_ptr, 27 | T *input_grad_ptr, 28 | size_t num_elements); 29 | 30 | } // namespace Reduction 31 | } // namespace Kernels 32 | } // namespace FlexFlow 33 | 34 | #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H 35 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/kernels/replicate_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H 2 | #define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H 3 | 4 | #include "flexflow/device.h" 5 | #include "flexflow/fftype.h" 6 | #include "flexflow/op_meta.h" 7 | #include "flexflow/parallel_ops/replicate.h" 8 | 9 | namespace FlexFlow { 10 | 11 | class ReplicateMeta : public OpMeta { 12 | public: 13 | ReplicateMeta(FFHandler handle, Replicate const *repl); 14 | }; 15 | 16 | namespace Kernels { 17 | namespace Replicate { 18 | 19 | template 20 | void forward_kernel(T const *input_ptr, T *output_ptr, size_t num_elements); 21 | 22 | template 23 | void backward_kernel(T const *output_grad_ptr, 24 | T *input_grad_ptr, 25 | size_t num_elements, 26 | size_t num_replicas); 27 | 28 | } // namespace Replicate 29 | } // namespace Kernels 30 | } // namespace FlexFlow 31 | 32 | #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H 33 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/parallel_identity_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H 2 | #define _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H 3 | 4 | namespace FlexFlow { 5 | 6 | struct ParallelIdentityParams { 7 | LayerID layer_guid; 8 | int parallel_identity_legion_dim; 9 | char name[MAX_OPNAME]; 10 | bool is_valid(ParallelTensorShape 
const &) const; 11 | }; 12 | bool operator==(ParallelIdentityParams const &, ParallelIdentityParams const &); 13 | 14 | } // namespace FlexFlow 15 | 16 | namespace std { 17 | template <> 18 | struct hash { 19 | size_t operator()(FlexFlow::ParallelIdentityParams const &) const; 20 | }; 21 | } // namespace std 22 | 23 | #endif // _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H 24 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/parallel_op.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_PARALLEL_OP_H 2 | #define _FLEXFLOW_PARALLEL_OP_H 3 | 4 | #include "flexflow/model.h" 5 | #include "tl/optional.hpp" 6 | 7 | namespace FlexFlow { 8 | 9 | struct ParallelOpJoinResult { 10 | tl::optional op = tl::nullopt; 11 | bool join_did_succeed = false; 12 | }; 13 | 14 | ParallelOpJoinResult try_join_parallel_ops(ParallelOpInfo const &, 15 | ParallelOpInfo const &); 16 | 17 | class ParallelOp : public Op { 18 | public: 19 | ParallelOp(FFModel &model, 20 | OperatorType type, 21 | char const *_name, 22 | const ParallelTensor input); 23 | virtual void init(FFModel const &) = 0; 24 | virtual void forward(FFModel const &) = 0; 25 | virtual void backward(FFModel const &) = 0; 26 | virtual void create_input_partition(FFModel &model) = 0; 27 | virtual void create_input_partition_inference( 28 | FFModel &model, 29 | std::vector const &batch_inputs, 30 | std::vector const &batch_outputs) { 31 | assert(false); 32 | } 33 | void print_layer(FFModel const &model){}; 34 | virtual bool measure_operator_cost(Simulator *sim, 35 | MachineView const &pc, 36 | CostMetrics &cost_metrics) const = 0; 37 | virtual bool append_parallel_op_info( 38 | std::vector ¶llel_ops) const = 0; 39 | virtual bool is_parallel_op() const; 40 | 41 | public: 42 | Legion::LogicalPartition input_lp, output_grad_lp; 43 | std::unordered_map 44 | inference_input_lps, inference_output_grad_lps; 45 | }; 46 | 47 | }; // namespace FlexFlow 48 | 49 | #endif // _FLEXFLOW_PARALLEL_OP_H 50 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/parallel_op_info.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_PARALLEL_OPS_PARALLEL_OP_INFO_H 2 | #define _FLEXFLOW_PARALLEL_OPS_PARALLEL_OP_INFO_H 3 | 4 | #include "flexflow/ffconst.h" 5 | 6 | namespace FlexFlow { 7 | 8 | struct ParallelOpInfo { 9 | friend void swap(ParallelOpInfo &, ParallelOpInfo &); 10 | 11 | OperatorType op_type; 12 | int parallel_dim; 13 | int parallel_degree; 14 | }; 15 | bool operator==(ParallelOpInfo const &, ParallelOpInfo const &); 16 | 17 | } // namespace FlexFlow 18 | 19 | #endif /* _FLEXFLOW_PARALLEL_OPS_PARALLEL_OP_INFO_H */ 20 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/partition_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_PARTITION_PARAMS_H 2 | #define _FLEXFLOW_PARTITION_PARAMS_H 3 | 4 | namespace FlexFlow { 5 | 6 | struct RepartitionParams { 7 | int repartition_legion_dim; 8 | int repartition_degree; 9 | char name[MAX_OPNAME]; 10 | bool is_valid(ParallelTensorShape const &) const; 11 | }; 12 | bool operator==(RepartitionParams const &, RepartitionParams const &); 13 | 14 | } // namespace FlexFlow 15 | 16 | namespace std { 17 | template <> 18 | struct hash { 19 | size_t operator()(FlexFlow::RepartitionParams const &) const; 
20 | }; 21 | } // namespace std 22 | 23 | #endif // _FLEXFLOW_PARTITION_PARAMS_H 24 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/reduction_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_REDUCTION_PARAMS_H 2 | #define _FLEXFLOW_REDUCTION_PARAMS_H 3 | 4 | namespace FlexFlow { 5 | 6 | struct ReductionParams { 7 | int reduction_legion_dim; 8 | int reduction_degree; 9 | char name[MAX_OPNAME]; 10 | bool is_valid(ParallelTensorShape const &) const; 11 | }; 12 | bool operator==(ReductionParams const &, ReductionParams const &); 13 | 14 | } // namespace FlexFlow 15 | 16 | namespace std { 17 | template <> 18 | struct hash { 19 | size_t operator()(FlexFlow::ReductionParams const &) const; 20 | }; 21 | } // namespace std 22 | 23 | #endif // _FLEXFLOW_REDUCTION_PARAMS_H 24 | -------------------------------------------------------------------------------- /include/flexflow/parallel_ops/replicate_params.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_REPLICATE_PARAMS_H 2 | #define _FLEXFLOW_REPLICATE_PARAMS_H 3 | 4 | namespace FlexFlow { 5 | 6 | struct ReplicateParams { 7 | int replicate_legion_dim; 8 | int replicate_degree; 9 | char name[MAX_OPNAME]; 10 | bool is_valid(ParallelTensorShape const &) const; 11 | }; 12 | bool operator==(ReplicateParams const &, ReplicateParams const &); 13 | 14 | } // namespace FlexFlow 15 | 16 | namespace std { 17 | template <> 18 | struct hash { 19 | size_t operator()(FlexFlow::ReplicateParams const &) const; 20 | }; 21 | } // namespace std 22 | 23 | #endif // _FLEXFLOW_REPLICATE_PARAMS_H 24 | -------------------------------------------------------------------------------- /include/flexflow/recompile.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | #ifndef _FLEXFLOW_RECOMPILE_H_ 17 | #define _FLEXFLOW_RECOMPILE_H_ 18 | 19 | #include "legion.h" 20 | #include 21 | 22 | namespace FlexFlow { 23 | 24 | class FFModel; 25 | 26 | class RecompileState { 27 | public: 28 | RecompileState(std::function _trigger_func, 29 | std::function _alter_func, 30 | FFModel *_ff); 31 | bool trigger(); 32 | void alter(); 33 | 34 | public: 35 | int recompilations; 36 | 37 | private: 38 | std::function trigger_func; 39 | std::function alter_func; 40 | FFModel *ff; 41 | }; 42 | 43 | }; // namespace FlexFlow 44 | #endif 45 | -------------------------------------------------------------------------------- /include/flexflow/runtime.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | #ifndef _FLEXFLOW_RUNTIME_H_ 17 | #define _FLEXFLOW_RUNTIME_H_ 18 | 19 | #include "config.h" 20 | 21 | namespace FlexFlow { 22 | 23 | class FFRuntime { 24 | public: 25 | FFRuntime(FFConfig &config); 26 | FFHandler handlers[MAX_NUM_WORKERS]; 27 | }; 28 | 29 | } // namespace FlexFlow 30 | 31 | #endif // _FLEXFLOW_RUNTIME_H_ 32 | -------------------------------------------------------------------------------- /include/flexflow/utils/disjoint_set.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_DISJOINT_SET_H 2 | #define _FLEXFLOW_DISJOINT_SET_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | template 10 | class m_disjoint_set { 11 | public: 12 | void m_union(T const *l, T const *r) { 13 | this->add_node_if_missing(l); 14 | this->add_node_if_missing(r); 15 | T const *ll = this->find(l); 16 | T const *rr = this->find(r); 17 | if (ll != rr) { 18 | this->mapping[ll] = rr; 19 | } 20 | } 21 | T const *find(T const *t) { 22 | this->add_node_if_missing(t); 23 | T const *parent = this->mapping.at(t); 24 | if (parent == nullptr) { 25 | return t; 26 | } else { 27 | return this->find(parent); 28 | } 29 | } 30 | 31 | private: 32 | void add_node_if_missing(T const *t) { 33 | if (mapping.find(t) == mapping.end()) { 34 | mapping[t] = nullptr; 35 | } 36 | } 37 | std::unordered_map mapping; 38 | }; 39 | 40 | template > 41 | class disjoint_set { 42 | public: 43 | void m_union(T const &l, T const &r) { 44 | this->nodes.insert(l); 45 | this->nodes.insert(r); 46 | this->ds.m_union(this->get_node(l), this->get_node(r)); 47 | } 48 | T const &find(T const &t) { 49 | this->nodes.insert(t); 50 | return *this->ds.find(this->get_node(t)); 51 | } 52 | std::map get_mapping() const { 53 | std::map mapping; 54 | for (T const &t : this->nodes) { 55 | mapping[t] = this->ds.find(&t); 56 | } 57 | return mapping; 58 | } 59 | 60 | private: 61 | T const *get_node(T const &t) { 62 | auto it = this->nodes.find(t); 63 | assert(it != this->nodes.end()); 64 | return &*it; 65 | } 66 | 67 | m_disjoint_set ds; 68 | std::set nodes; 69 
| }; 70 | 71 | #endif // _FLEXFLOW_DISJOINT_SET_H 72 | -------------------------------------------------------------------------------- /include/flexflow/utils/dot/record_formatter.h: -------------------------------------------------------------------------------- 1 | #ifndef _RECORD_FORMATTER_H 2 | #define _RECORD_FORMATTER_H 3 | 4 | #include 5 | #include 6 | 7 | class RecordFormatter { 8 | friend RecordFormatter &operator<<(RecordFormatter &r, 9 | std::string const &tok); 10 | friend RecordFormatter &operator<<(RecordFormatter &r, int tok); 11 | friend RecordFormatter &operator<<(RecordFormatter &r, float tok); 12 | friend RecordFormatter &operator<<(RecordFormatter &r, 13 | RecordFormatter const &sub_r); 14 | friend RecordFormatter &operator<<(RecordFormatter &r, 15 | std::ostringstream &oss); 16 | friend std::ostream &operator<<(std::ostream &s, RecordFormatter const &r); 17 | 18 | private: 19 | std::vector pieces; 20 | }; 21 | 22 | #endif // _RECORD_FORMATTER_H -------------------------------------------------------------------------------- /include/flexflow/utils/random_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef _RANDOM_UTILS_H 2 | #define _RANDOM_UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | float randf(); 9 | 10 | template 11 | T select_random(std::vector const &values) { 12 | return values[std::rand() % values.size()]; 13 | } 14 | 15 | template 16 | T select_random_determistic(std::vector const &values, 17 | std::vector const &weights, 18 | float value) { 19 | if (values.empty()) { 20 | throw std::invalid_argument("Values list must not be empty."); 21 | } 22 | float total = 0.0f; 23 | for (auto const &w : weights) { 24 | if (w < 0) { 25 | throw std::invalid_argument("Weights must not be negative"); 26 | } 27 | total += w; 28 | } 29 | 30 | float r = value * total; 31 | float curr = 0.0f; 32 | int i = -1; 33 | while (curr <= r && (i < 0 || i < (int)values.size() - 1)) { 34 | i++; 35 | curr += weights[i]; 36 | } 37 | return values[i]; 38 | } 39 | 40 | template 41 | T select_random(std::vector const &values, 42 | std::vector const &weights) { 43 | return select_random_determistic(values, weights, randf()); 44 | } 45 | 46 | #endif // _RANDOM_UTILS_H 47 | -------------------------------------------------------------------------------- /include/flexflow/utils/recursive_logger.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_RECURSIVE_LOGGER_H 2 | #define _FLEXFLOW_RECURSIVE_LOGGER_H 3 | 4 | #include "legion/legion_utilities.h" 5 | #include 6 | 7 | #define CONCAT(a, b) CONCAT_INNER(a, b) 8 | #define CONCAT_INNER(a, b) a##b 9 | #define UNIQUE_TAG() CONCAT(tag, __COUNTER__) 10 | #define TAG_ENTER(mlogger) auto UNIQUE_TAG() = mlogger->enter_tag() 11 | 12 | namespace FlexFlow { 13 | 14 | class RecursiveLogger; 15 | 16 | class DepthTag { 17 | public: 18 | DepthTag() = delete; 19 | DepthTag(RecursiveLogger &); 20 | DepthTag(DepthTag const &) = delete; 21 | ~DepthTag(); 22 | 23 | private: 24 | RecursiveLogger &logger; 25 | }; 26 | 27 | class RecursiveLogger { 28 | public: 29 | /* RecursiveLogger(Legion::Logger const &); */ 30 | RecursiveLogger(std::string const &category_name); 31 | 32 | Realm::LoggerMessage info(); 33 | Realm::LoggerMessage debug(); 34 | Realm::LoggerMessage spew(); 35 | void enter(); 36 | void leave(); 37 | 38 | std::unique_ptr enter_tag(); 39 | 40 | private: 41 | int depth = 0; 42 | 43 | void print_prefix(Realm::LoggerMessage &) const; 44 | 45 | 
Legion::Logger logger; 46 | }; 47 | 48 | }; // namespace FlexFlow 49 | #endif // _FLEXFLOW_RECURSIVE_LOGGER_H 50 | -------------------------------------------------------------------------------- /include/flexflow/utils/test_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_UTILS_H_ 2 | #define _FLEXFLOW_UTILS_H_ 3 | #include "flexflow/model.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace FlexFlow { 10 | 11 | struct ArgsConfig; 12 | 13 | void initialize_tensor_from_file(const std::string file_path, 14 | Tensor label, 15 | FFModel const &ff, 16 | std::string data_type = "float", 17 | int num_dim = 3); 18 | 19 | void initialize_tensor_gradient_from_file(const std::string file_path, 20 | Tensor label, 21 | FFModel const &ff, 22 | std::string data_type, 23 | int num_dim); 24 | 25 | void initialize_tensor_from_file(const std::string file_path, 26 | Tensor label, 27 | FFModel const &ff, 28 | std::string data_type, 29 | int num_dim); 30 | 31 | template 32 | void initialize_tensor_from_file_task( 33 | Legion::Task const *task, 34 | std::vector const ®ions, 35 | Legion::Context ctx, 36 | Legion::Runtime *runtime); 37 | 38 | void dump_region_to_file(FFModel &ff, 39 | Legion::LogicalRegion ®ion, 40 | std::string file_path, 41 | int dims = 4); 42 | 43 | template 44 | void dump_tensor_task(Legion::Task const *task, 45 | std::vector const ®ions, 46 | Legion::Context ctx, 47 | Legion::Runtime *runtime); 48 | 49 | void register_custom_tasks(); 50 | 51 | }; // namespace FlexFlow 52 | #endif 53 | -------------------------------------------------------------------------------- /include/flexflow/utils/tuple.h: -------------------------------------------------------------------------------- 1 | #ifndef _FLEXFLOW_UTILS_TUPLE_H 2 | #define _FLEXFLOW_UTILS_TUPLE_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | // Adapted from 9 | // https://github.com/bitwizeshift/BackportCpp/blob/4f33a7f9b219f169e60d8ed2fd5731a3a23288e4/include/bpstd/tuple.hpp 10 | 11 | namespace FlexFlow { 12 | 13 | namespace TupleUtils { 14 | 15 | template 16 | struct index_of_impl; 17 | 18 | template 19 | struct index_of_impl 20 | : index_of_impl {}; 21 | 22 | template 23 | struct index_of_impl 24 | : std::integral_constant {}; 25 | 26 | template 27 | struct index_of : index_of_impl {}; 28 | 29 | }; // namespace TupleUtils 30 | 31 | template 32 | T &get(std::tuple &t) noexcept { 33 | return std::get::value>(t); 34 | } 35 | 36 | template 37 | T &&get(std::tuple &&t) noexcept { 38 | return move(std::get::value>(t)); 39 | } 40 | 41 | template 42 | T const &get(std::tuple const &t) noexcept { 43 | return std::get::value>(t); 44 | } 45 | 46 | template 47 | T const &&get(std::tuple const &&t) noexcept { 48 | return move(std::get::value>(t)); 49 | } 50 | 51 | }; // namespace FlexFlow 52 | 53 | #endif // _FLEXFLOW_UTILS_TUPLE_H -------------------------------------------------------------------------------- /inference/.gitignore: -------------------------------------------------------------------------------- 1 | configs 2 | weights 3 | tokenizers 4 | prompt 5 | output 6 | .env -------------------------------------------------------------------------------- /inference/README.md: -------------------------------------------------------------------------------- 1 | # Inference Examples 2 | This folder contains the code to run inference examples in FlexFlow 3 | 4 | To create a sample prompt, call (from the `build` folder): 5 | 6 | ```bash 7 | mkdir -p 
../inference/prompt 8 | echo '["San Francisco is a "]' > ../inference/prompt/test.json 9 | ``` 10 | 11 | To download a model for use in C++, call: 12 | ```bash 13 | huggingface-cli login # if needed 14 | python ../inference/utils/download_hf_model.py meta-llama/Llama-2-7b-hf --half-precision-only 15 | ``` 16 | 17 | To run the incremental decoding example in C++, call: 18 | 19 | ```bash 20 | ./inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 21 | ``` 22 | 23 | To run the speculative inference example in C++, call: 24 | 25 | ```bash 26 | ./inference/spec_infer/spec_infer -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 27 | ``` 28 | 29 | To run a PEFT model example in C++, call: 30 | 31 | ```bash 32 | ./inference/peft/peft \ 33 | -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ 34 | -tensor-parallelism-degree 4 \ 35 | -ll:fsize 8192 -ll:zsize 12000 \ 36 | -llm-model JackFram/llama-160m \ 37 | -finetuning-dataset ../inference/prompt/peft_dataset.json \ 38 | -peft-model goliaro/llama-160m-lora \ 39 | -enable-peft \ 40 | --use-full-precision \ 41 | --inference-debugging 42 | ``` -------------------------------------------------------------------------------- /inference/inference_wrapper.in: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set the LD_LIBRARY_PATH using LIBTORCH_PYTHON_DIR from CMake. 3 | export LD_LIBRARY_PATH="@LIBTORCH_PYTHON_DIR@:$LD_LIBRARY_PATH" 4 | # Launch the executable using the specified launcher. 5 | @LAUNCHER@ "${BASH_SOURCE[0]%/*}/@TARGET_PATH@" "$@" -------------------------------------------------------------------------------- /inference/python/peft_demo/INSTRUCTIONS.md: -------------------------------------------------------------------------------- 1 | ## Peft Demo 2 | * `git clone -b peft --recursive https://github.com/flexflow/FlexFlow.git` 3 | * `cd FlexFlow/` 4 | 5 | * If you wish to run the demo by installing FlexFlow 6 | * `conda env create -f conda/flexflow.yml` 7 | * `conda activate flexflow` 8 | 9 | * If you wish to run the demo using a Docker container 10 | * `export FF_CUDA_ARCH=all && export cuda_version=12.0 && ./docker/build.sh flexflow && ./docker/run.sh flexflow` 11 | 12 | * Then, install the Llama2 model (the `meta-llama/Llama-2-7b-hf` model is gated, so make sure to add your HF access token) 13 | 14 | * `export HUGGINGFACE_TOKEN="[Your token]"` 15 | * `huggingface-cli login --token "$HUGGINGFACE_TOKEN"` 16 | * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full"` 17 | 18 | * Run the demo 19 | ``` 20 | mkdir inference/output 21 | cd inference/python/peft_demo/ 22 | python3 demo.py -config-file demo_config.json 23 | ``` 24 | 25 | 26 | -------------------------------------------------------------------------------- /inference/python/streamlit/README.md: -------------------------------------------------------------------------------- 1 | # Streamlit demo 2 | 3 | ## Instructions 4 | 5 | 1. Build and install FlexFlow, or build and run `source ./set_python_envs.sh` from the build folder 6 | 2. Edit the flexflow-serve/inference/python/streamlit/fastapi_incr.py to configure the model to run and the system configs (num gpus, amount of memory, etc) 7 | 3. 
In one terminal, launch the LLM engine with the commands below, and wait until the model's weights loading completes 8 | ``` 9 | cd flexflow-serve/inference/python/streamlit 10 | export PORT_NUMBER=8080 11 | uvicorn fastapi_incr:app --reload --port $PORT_NUMBER 12 | ``` 13 | 4. In another terminal, launch the streamlit app: 14 | ``` 15 | cd flexflow-serve/inference/python/streamlit 16 | streamlit run app.py --server.port 8501 --server.address 0.0.0.0 17 | ``` 18 | 5. Open the URL printed to the terminal, e.g. `http://localhost:8501` and interact with the app via browser 19 | 20 | -------------------------------------------------------------------------------- /inference/utils/download_hf_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import flexflow.serve as ff 3 | import argparse, os 4 | 5 | 6 | def parse_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument( 9 | "model_names", type=str, nargs="+", help="Name of the model(s) to download" 10 | ) 11 | parser.add_argument( 12 | "--cache-folder", 13 | type=str, 14 | help="Folder to use to store the model(s) assets in FlexFlow format", 15 | default=os.environ.get("FF_CACHE_PATH", ""), 16 | ) 17 | parser.add_argument( 18 | "--refresh-cache", 19 | action="store_true", 20 | help="Use this flag to force the refresh of the model(s) weights/tokenizer cache", 21 | ) 22 | group = parser.add_mutually_exclusive_group() 23 | group.add_argument( 24 | "--full-precision-only", 25 | action="store_true", 26 | help="Only download the full precision version of the weights", 27 | ) 28 | group.add_argument( 29 | "--half-precision-only", 30 | action="store_true", 31 | help="Only download the half precision version of the weights", 32 | ) 33 | args = parser.parse_args() 34 | return args 35 | 36 | 37 | def main(args): 38 | if args.full_precision_only: 39 | data_types = (ff.DataType.DT_FLOAT,) 40 | elif args.half_precision_only: 41 | data_types = (ff.DataType.DT_HALF,) 42 | else: 43 | data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) 44 | 45 | for model_name in args.model_names: 46 | for data_type in data_types: 47 | llm = ff.LLM( 48 | model_name, 49 | data_type=data_type, 50 | cache_path=args.cache_folder, 51 | refresh_cache=args.refresh_cache, 52 | ) 53 | llm.download_hf_weights_if_needed() 54 | llm.download_hf_tokenizer_if_needed() 55 | llm.download_hf_config() 56 | 57 | 58 | if __name__ == "__main__": 59 | args = parse_args() 60 | main(args) 61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "wheel", 4 | "setuptools>=45", 5 | "setuptools_scm[toml]>=6.0", 6 | "cmake-build-extension", 7 | "ninja", 8 | "requests", 9 | "pip", 10 | ] 11 | build-backend = "setuptools.build_meta" 12 | -------------------------------------------------------------------------------- /python/flexflow/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | from .config import flexflow_dir 17 | -------------------------------------------------------------------------------- /python/flexflow/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | import os 17 | 18 | # python binding 19 | _FF_PYTHON_BINDING = "cffi" 20 | 21 | 22 | def flexflow_python_binding(): 23 | return _FF_PYTHON_BINDING 24 | 25 | 26 | _FF_ALREADY_INITIALIZED = False 27 | 28 | 29 | def flexflow_already_initialized(): 30 | global _FF_ALREADY_INITIALIZED 31 | return _FF_ALREADY_INITIALIZED 32 | 33 | 34 | def set_flexflow_initialized(): 35 | global _FF_ALREADY_INITIALIZED 36 | if _FF_ALREADY_INITIALIZED == True: 37 | raise RuntimeError( 38 | "Attempting to set _FF_ALREADY_INITIALIZED=True, but _FF_ALREADY_INITIALIZED is already True" 39 | ) 40 | _FF_ALREADY_INITIALIZED = True 41 | 42 | 43 | # FlexFlow dir 44 | _FF_DIR = os.path.dirname(os.path.realpath(__file__)) 45 | 46 | 47 | def flexflow_dir(): 48 | return _FF_DIR 49 | 50 | # Get runtime configs from the command line 51 | def get_configs(): 52 | import argparse,json 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument( 55 | "-config-file", 56 | help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.", 57 | type=str, 58 | default=None, 59 | ) 60 | args, unknown = parser.parse_known_args() 61 | if args.config_file is not None: 62 | with open(args.config_file) as f: 63 | return json.load(f) 64 | else: 65 | return None 66 | -------------------------------------------------------------------------------- /python/flexflow/serve/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .llama import FlexFlowLLAMA, LLAMAConfig 16 | from .opt import FlexFlowOPT, OPTConfig 17 | from .falcon import FlexFlowFalcon, FalconConfig 18 | from .starcoder import FlexFlowSTARCODER, STARCODERConfig 19 | from .mpt import FlexFlowMPT, MPTConfig 20 | -------------------------------------------------------------------------------- /python/flexflow/serve/models/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from flexflow.core import * 15 | 16 | 17 | class FlexFlowModel: 18 | def __init__( 19 | self, 20 | ffmodel: FFModel, 21 | mode: InferenceMode, 22 | generation_config: GenerationConfig, 23 | ffconfig: FFConfig, 24 | hf_config: any, 25 | data_type: DataType, 26 | ): 27 | self.build_model() 28 | 29 | def build_model(self): 30 | assert False, "Not implemented yet" 31 | 32 | def convert_hf_weight_name(name): 33 | assert False, "Not implemented yet" 34 | 35 | def convert_hf_model(model, dst_folder): 36 | assert False, "Not implemented yet" 37 | -------------------------------------------------------------------------------- /python/flexflow/torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/python/flexflow/torch/__init__.py -------------------------------------------------------------------------------- /python/flexflow/torch/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .modules import * 2 | -------------------------------------------------------------------------------- /python/flexflow/torch/nn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .module import Module 2 | 3 | from torch.nn import Conv2d as Conv2d 4 | from torch.nn import MaxPool2d as MaxPool2d 5 | from torch.nn import Linear as Linear 6 | from torch.nn import Dropout as Dropout 7 | from torch.nn import Flatten as Flatten 8 | from torch.nn import ReLU as ReLU 9 | -------------------------------------------------------------------------------- /python/flexflow/torch/nn/modules/module.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import flexflow.core as ff 3 | import flexflow.torch.fx as fx 4 | 5 | class Module(nn.Module): 6 | def __init__(self): 7 | super(Module, self).__init__() 8 | self._ffconfig = ff.FFConfig() 9 | self._ffconfig.parse_args() 10 | self._ffmodel = ff.FFModel(self._ffconfig) 11 | self._graph = None 12 | 13 | def __call__(self, input): 14 | print("forward"); 15 | 16 | # TODO: automatically call this function 17 | def symbolic_trace(self): 18 | self._graph = fx.symbolic_trace(self) 19 | for node in self._graph: 20 | if type(node) == fx.ModuleNode: 21 | 
print(node.name, node.module) -------------------------------------------------------------------------------- /python/flexflow_cffi_header.py.in: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # IMPORTANT: 19 | # * legion_cffi.py.in is used as an input to string.format() 20 | # * legion_cffi.py is a generated file and should not be modified by hand 21 | 22 | from __future__ import absolute_import, division, print_function, unicode_literals 23 | 24 | flexflow_header = {header} 25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cffi 2 | numpy 3 | cmake-build-extension 4 | ninja 5 | requests 6 | regex 7 | torch 8 | torchaudio 9 | torchvision 10 | flash-attn 11 | transformers>=4.47.1 12 | sentencepiece 13 | einops 14 | pip 15 | # peft-related 16 | scipy 17 | bitsandbytes 18 | datasets 19 | accelerate 20 | loralib 21 | triton 22 | peft 23 | -------------------------------------------------------------------------------- /scripts/format.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | GIT_ROOT="$(git rev-parse --show-toplevel)" 6 | cd "$GIT_ROOT" 7 | 8 | TOOLS_PATH="$GIT_ROOT/.tools" 9 | RELEASE="master-1d7ec53d" 10 | CLANG_FORMAT_VERSION="15" 11 | CLANG_FORMAT_PATH="$TOOLS_PATH/clang-format-$CLANG_FORMAT_VERSION-$RELEASE" 12 | 13 | mkdir -p "$TOOLS_PATH" 14 | 15 | error() { 16 | >&2 echo "$@" 17 | exit 1 18 | } 19 | 20 | get_os() { 21 | UNAME_OUTPUT="$(uname -s)" 22 | case "$UNAME_OUTPUT" in 23 | Linux*) 24 | OS=Linux 25 | ;; 26 | Darwin*) 27 | OS=Mac 28 | ;; 29 | *) 30 | error "Unknown OS $UNAME_OUTPUT. Exiting..." 31 | esac 32 | 33 | echo "$OS" 34 | } 35 | 36 | download_clang_tool() { 37 | TOOL="$1" 38 | VERSION="$2" 39 | TARGET_PATH="$3" 40 | 41 | BASE_URL="https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/$RELEASE/" 42 | 43 | OS="$(get_os)" 44 | case "$OS" in 45 | Linux) 46 | URL_OS="linux" 47 | ;; 48 | Mac) 49 | URL_OS="macosx" 50 | ;; 51 | *) 52 | error "Unknown return value from get_os: $OS. Exiting..." 53 | esac 54 | URL="$BASE_URL/clang-${TOOL}-${VERSION}_${URL_OS}-amd64" 55 | echo "Downloading from $URL..." 56 | 57 | if command -v wget &> /dev/null; then 58 | wget "$URL" -O "$TARGET_PATH" 59 | elif command -v curl &> /dev/null; then 60 | curl -L "$URL" -o "$TARGET_PATH" 61 | else 62 | error "Could not find either wget or curl. Exiting..." 63 | fi 64 | } 65 | 66 | if [[ ! 
-e $CLANG_FORMAT_PATH ]]; then 67 | download_clang_tool format "$CLANG_FORMAT_VERSION" "$CLANG_FORMAT_PATH" 68 | chmod u+x "$CLANG_FORMAT_PATH" 69 | fi 70 | 71 | mapfile -t FILES < <(git ls-files ':!:triton/**' '*.h' '*.cc' '*.cpp' '*.cu' '*.c') 72 | "$CLANG_FORMAT_PATH" -i "${FILES[@]}" 73 | -------------------------------------------------------------------------------- /scripts/install_tokenizer.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | set -x 3 | set -e 4 | 5 | # Cd into directory holding this script 6 | cd "${BASH_SOURCE[0]%/*}" 7 | cd ../deps/tokenizers-cpp/example 8 | cmake -D CMAKE_CXX_FLAGS=-fPIC 9 | make -j 10 | -------------------------------------------------------------------------------- /scripts/mnist_mlp_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | eval "$(conda shell.bash hook)" 3 | conda activate flexflow 4 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib 5 | 6 | # Path to your FlexFlow build 7 | FLEXFLOW_DIR=/home/ubuntu/FlexFlow/build 8 | 9 | # Path to your UCX installation 10 | UCX_DIR=/home/ubuntu/ucx-1.15.0/install 11 | 12 | export REALM_UCP_BOOTSTRAP_PLUGIN=$FLEXFLOW_DIR/deps/legion/lib/realm_ucp_bootstrap_mpi.so 13 | export LD_LIBRARY_PATH=$FLEXFLOW_DIR/deps/legion/lib:$LD_LIBRARY_PATH 14 | export LD_LIBRARY_PATH=$FLEXFLOW_DIR:$LD_LIBRARY_PATH 15 | export LD_LIBRARY_PATH=$UCX_DIR/lib:$LD_LIBRARY_PATH 16 | export LD_LIBRARY_PATH=/opt/conda/envs/flexflow/lib:$LD_LIBRARY_PATH 17 | 18 | mpiexec -x REALM_UCP_BOOTSTRAP_PLUGIN -x PATH -x LD_LIBRARY_PATH --hostfile ~/hostfile --mca btl_tcp_if_include ens5 -np 2 "$FLEXFLOW_DIR"/flexflow_python "$FLEXFLOW_DIR"/../examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize 8000 -ll:zsize 8000 19 | -------------------------------------------------------------------------------- /src/ops/mean.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | #include "flexflow/ops/mean.h" 17 | #include "flexflow/utils/hip_helper.h" 18 | #include 19 | 20 | namespace FlexFlow {}; // namespace FlexFlow 21 | -------------------------------------------------------------------------------- /src/ops/mean.cu: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 
5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | #include "flexflow/ops/mean.h" 17 | #include "flexflow/utils/cuda_helper.h" 18 | 19 | namespace FlexFlow {}; // namespace FlexFlow 20 | -------------------------------------------------------------------------------- /src/ops/moe.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 CMU, Facebook, Stanford 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | #include "flexflow/model.h" 17 | 18 | using namespace FlexFlow; 19 | 20 | Tensor FFModel::moe(const Tensor input, 21 | int num_exp, 22 | int num_select, 23 | int expert_hidden_size, 24 | float alpha, 25 | float lambda) { 26 | // MoE model 27 | Tensor gate_preds = dense(input, num_exp, AC_MODE_RELU); 28 | Tensor topK_output[2]; 29 | top_k(gate_preds, topK_output, num_select, false); 30 | Tensor exp_tensors[num_exp]; 31 | group_by(input, topK_output[1], exp_tensors, num_exp, alpha); 32 | Tensor agg_inputs[num_exp + 4]; 33 | agg_inputs[0] = softmax(topK_output[0]); // gate preds 34 | agg_inputs[1] = topK_output[1]; // gate assign 35 | agg_inputs[2] = topK_output[1]; // gate assign TopK (for cache) 36 | agg_inputs[3] = gate_preds; // full gate preds 37 | for (int i = 0; i < num_exp; i++) { 38 | Tensor exp_pred = dense(exp_tensors[i], expert_hidden_size, AC_MODE_RELU); 39 | agg_inputs[i + 4] = softmax(exp_pred); 40 | } 41 | Tensor coop_output = aggregate(agg_inputs, num_exp, lambda); 42 | // get_metrics(); 43 | return coop_output; 44 | } 45 | -------------------------------------------------------------------------------- /src/parallel_ops/fused_parallel_op.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | #include "flexflow/parallel_ops/fused_parallel_op.h" 17 | #include "flexflow/utils/hip_helper.h" 18 | #include 19 | 20 | namespace FlexFlow {}; // namespace FlexFlow 21 | -------------------------------------------------------------------------------- /src/parallel_ops/fused_parallel_op.cu: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | #include "flexflow/parallel_ops/fused_parallel_op.h" 17 | #include "flexflow/utils/cuda_helper.h" 18 | 19 | namespace FlexFlow {}; // namespace FlexFlow 20 | -------------------------------------------------------------------------------- /src/recompile/recompile_state.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | #include "flexflow/model.h" 17 | #include "flexflow/recompile.h" 18 | #include "legion.h" 19 | 20 | namespace FlexFlow { 21 | 22 | RecompileState::RecompileState(std::function _trigger_func, 23 | std::function _alter_func, 24 | FFModel *_ff) 25 | : trigger_func(_trigger_func), alter_func(_alter_func), ff(_ff) { 26 | recompilations = 0; 27 | } 28 | 29 | bool RecompileState::trigger() { 30 | return trigger_func(ff); 31 | } 32 | 33 | void RecompileState::alter() { 34 | if (recompilations == 0) { 35 | alter_func(ff); 36 | } 37 | recompilations++; 38 | } 39 | 40 | }; // namespace FlexFlow 41 | -------------------------------------------------------------------------------- /src/runtime/accessor_kernel.cpp: -------------------------------------------------------------------------------- 1 | #include "flexflow/utils/hip_helper.h" 2 | #include 3 | 4 | namespace FlexFlow { 5 | 6 | using namespace Legion; 7 | 8 | template 9 | __global__ void zero_array(DT *ptr, coord_t size) { 10 | CUDA_KERNEL_LOOP(i, size) { 11 | ptr[i] = 0; 12 | } 13 | } 14 | 15 | }; // namespace FlexFlow 16 | -------------------------------------------------------------------------------- /src/runtime/accessor_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "flexflow/utils/cuda_helper.h" 2 | 3 | namespace FlexFlow { 4 | 5 | using namespace Legion; 6 | 7 | template 8 | __global__ void zero_array(DT *ptr, coord_t size) { 9 | CUDA_KERNEL_LOOP(i, size) { 10 | ptr[i] = 0; 11 | } 12 | } 13 | 14 | }; // namespace FlexFlow 15 | -------------------------------------------------------------------------------- /src/runtime/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | g++ dlrm_strategy.cc strategy.pb.cc -o generator -std=c++11 -lprotobuf -L/usr/local/lib -I/usr/local/include -I"${PROTOBUF}"/src -pthread -O2 4 | g++ dlrm_strategy_hetero.cc strategy.pb.cc -o generator_hetero -std=c++11 -lprotobuf -L/usr/local/lib -I/usr/local/include -I"${PROTOBUF}"/src -pthread -O2 5 | -------------------------------------------------------------------------------- /src/runtime/cpp_driver.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | #include "dirent.h" 17 | #include "flexflow/mapper.h" 18 | #include "flexflow/model.h" 19 | 20 | using namespace Legion; 21 | using namespace FlexFlow; 22 | 23 | // ======================================================== 24 | // Task and mapper registrations 25 | // ======================================================== 26 | int main(int argc, char **argv) { 27 | // This needs to be set, otherwise NCCL will try to use group kernel launches, 28 | // which are not compatible with the Realm CUDA hijack. 
29 | setenv("NCCL_LAUNCH_MODE", "PARALLEL", true); 30 | 31 | Runtime::set_top_level_task_id(TOP_LEVEL_TASK_ID); 32 | { 33 | TaskVariantRegistrar registrar(TOP_LEVEL_TASK_ID, "top_level"); 34 | registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); 35 | registrar.set_replicable(); 36 | Runtime::preregister_task_variant(registrar, "top_level"); 37 | } 38 | 39 | register_flexflow_internal_tasks(); 40 | 41 | // Register custom tasks 42 | register_custom_tasks(); 43 | 44 | Runtime::add_registration_callback(FFMapper::update_mappers); 45 | return Runtime::start(argc, argv); 46 | } 47 | -------------------------------------------------------------------------------- /src/runtime/fftype.cc: -------------------------------------------------------------------------------- 1 | #include "flexflow/fftype.h" 2 | #include "flexflow/config.h" 3 | #include 4 | 5 | namespace FlexFlow { 6 | 7 | const LayerID LayerID::NO_ID = LayerID(); 8 | 9 | LayerID::LayerID() 10 | : id(0), transformer_layer_id(MAX_NUM_TRANSFORMER_LAYERS), model_id(0) {} 11 | 12 | LayerID::LayerID(size_t _id, size_t _transformer_layer_id, size_t _model_id) 13 | : id(_id), transformer_layer_id(_transformer_layer_id), 14 | model_id(_model_id) { 15 | assert(is_valid_id()); 16 | } 17 | 18 | bool LayerID::is_valid_id() const { 19 | return (id >= LAYER_GUID_FIRST_VALID && id <= LAYER_GUID_LAST_VALID && 20 | transformer_layer_id >= 0 && 21 | transformer_layer_id < MAX_NUM_TRANSFORMER_LAYERS && model_id >= 0); 22 | } 23 | 24 | bool operator==(LayerID const &lhs, LayerID const &rhs) { 25 | // id should be sufficient to distinguish different layers 26 | if (lhs.id == rhs.id) { 27 | assert(lhs.transformer_layer_id == rhs.transformer_layer_id); 28 | assert(lhs.model_id == rhs.model_id); 29 | } 30 | return lhs.id == rhs.id; 31 | } 32 | 33 | const PEFTModelID PEFTModelID::NO_ID = PEFTModelID(); 34 | 35 | PEFTModelID::PEFTModelID() : id(0) {} 36 | 37 | PEFTModelID::PEFTModelID(size_t _id) : id(_id) { 38 | assert(is_valid_id()); 39 | } 40 | 41 | bool PEFTModelID::is_valid_id() const { 42 | return (id >= PEFT_MODEL_ID_FIRST_VALID && id <= PEFT_MODEL_ID_LAST_VALID); 43 | } 44 | 45 | bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { 46 | return lhs.id == rhs.id; 47 | } 48 | 49 | bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) { 50 | return !(lhs == rhs); 51 | } 52 | 53 | std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { 54 | if (peft_model_id == PEFTModelID::NO_ID) { 55 | os << "NO_ID"; 56 | } else { 57 | os << peft_model_id.id; 58 | } 59 | return os; 60 | } 61 | 62 | }; // namespace FlexFlow 63 | -------------------------------------------------------------------------------- /src/runtime/memory_optimization.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | #include "flexflow/memory_optimization.h" 17 | 18 | namespace FlexFlow { 19 | 20 | namespace PCG { 21 | 22 | std::string MemoryUsage::to_string() const { 23 | std::string type_name; 24 | switch (usage_type) { 25 | case MemoryUsageType::GLOBAL: 26 | type_name = "GLOBAL"; 27 | break; 28 | case MemoryUsageType::PER_DEVICE_MAX: 29 | type_name = "PER_DEVICE_MAX"; 30 | break; 31 | } 32 | return "(MemoryUsageType:" + type_name + ", Usage:" + std::to_string(num) + 33 | ")"; 34 | } 35 | 36 | MemoryUsage &MemoryUsage::operator+=(MemoryUsage const &rhs) { 37 | assert(usage_type == rhs.usage_type); 38 | 39 | // Handle the merge of memory usage differently here. 40 | switch (usage_type) { 41 | case MemoryUsageType::GLOBAL: 42 | num += rhs.num; 43 | break; 44 | case MemoryUsageType::PER_DEVICE_MAX: 45 | num = std::max(num, rhs.num); 46 | break; 47 | } 48 | 49 | return *this; 50 | } 51 | 52 | MemoryUsage operator+(MemoryUsage lhs, MemoryUsage const &rhs) { 53 | lhs += rhs; 54 | return lhs; 55 | } 56 | 57 | std::ostream &operator<<(std::ostream &s, MemoryUsage const &usage) { 58 | s << usage.to_string(); 59 | return s; 60 | } 61 | 62 | } // namespace PCG 63 | 64 | } // namespace FlexFlow 65 | -------------------------------------------------------------------------------- /src/runtime/operator.cc: -------------------------------------------------------------------------------- 1 | #include "flexflow/operator.h" 2 | #include "flexflow/ffconst_utils.h" 3 | #include "flexflow/simulator.h" 4 | #include <stdexcept> 5 | #include <unistd.h> 6 | #include <wordexp.h> 7 | 8 | namespace FlexFlow { 9 | 10 | size_t Op::get_untyped_params_hash() const { 11 | size_t hash = this->get_params_hash(); 12 | hash_combine(hash, this->op_type); 13 | return hash; 14 | } 15 | 16 | size_t Op::get_params_hash() const { 17 | throw std::runtime_error( 18 | "No overload of get_params_hash defined for op type " + 19 | get_operator_type_name(this->op_type)); 20 | } 21 | 22 | fs::path get_dst_folder(std::string const &subdir, 23 | int step_idx, 24 | int shard_idx, 25 | bool before_kernel) { 26 | std::vector<std::string> debug_subdirs = {"fwd", "bwd", "optim", "weights"}; 27 | assert(std::find(debug_subdirs.begin(), debug_subdirs.end(), subdir) != 28 | debug_subdirs.end()); 29 | std::string step_substr = "step_" + std::to_string(step_idx); 30 | if (before_kernel) { 31 | step_substr += "_pre"; 32 | } 33 | char cwd[PATH_MAX]; 34 | char *result = getcwd(cwd, sizeof(cwd)); 35 | assert(result && "getcwd failed"); 36 | 37 | // char const *ff_cache_path = std::string(std::getenv("FF_DEBUG_PATH")) == 38 | // "." ? 39 | // cwd : std::getenv("FF_DEBUG_PATH"); 40 | 41 | char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); 42 | 43 | std::string debug_dir_ = 44 | ff_cache_path ? 
std::string(ff_cache_path) + "/debug/flexflow" 45 | : std::string("~/.cache/flexflow/debug/flexflow"); 46 | wordexp_t p; 47 | wordexp(debug_dir_.c_str(), &p, 0); 48 | debug_dir_ = p.we_wordv[0]; 49 | wordfree(&p); 50 | fs::path debug_dir = debug_dir_; 51 | if (!fs::is_directory(debug_dir)) { 52 | printf("invalid debug directory: %s\n", debug_dir.c_str()); 53 | } 54 | assert(fs::is_directory(debug_dir)); 55 | fs::path dst_folder = 56 | debug_dir / subdir / step_substr / ("shard_" + std::to_string(shard_idx)); 57 | fs::create_directories(dst_folder); 58 | return dst_folder; 59 | } 60 | 61 | }; // namespace FlexFlow -------------------------------------------------------------------------------- /src/runtime/recursive_logger.cc: -------------------------------------------------------------------------------- 1 | #include "flexflow/utils/recursive_logger.h" 2 | 3 | namespace FlexFlow { 4 | 5 | RecursiveLogger::RecursiveLogger(std::string const &category_name) 6 | : logger(category_name) {} 7 | 8 | Realm::LoggerMessage RecursiveLogger::info() { 9 | Realm::LoggerMessage msg = this->logger.info(); 10 | this->print_prefix(msg); 11 | return msg; 12 | } 13 | 14 | Realm::LoggerMessage RecursiveLogger::debug() { 15 | Realm::LoggerMessage msg = this->logger.debug(); 16 | this->print_prefix(msg); 17 | return msg; 18 | } 19 | 20 | Realm::LoggerMessage RecursiveLogger::spew() { 21 | Realm::LoggerMessage msg = this->logger.spew(); 22 | this->print_prefix(msg); 23 | return msg; 24 | } 25 | 26 | void RecursiveLogger::print_prefix(Realm::LoggerMessage &msg) const { 27 | msg << this->depth << " "; 28 | for (int i = 0; i < this->depth; i++) { 29 | msg << " "; 30 | } 31 | } 32 | 33 | void RecursiveLogger::enter() { 34 | this->depth++; 35 | } 36 | 37 | void RecursiveLogger::leave() { 38 | this->depth--; 39 | assert(this->depth >= 0); 40 | } 41 | 42 | std::unique_ptr<DepthTag> RecursiveLogger::enter_tag() { 43 | return std::unique_ptr<DepthTag>(new DepthTag(*this)); 44 | } 45 | 46 | DepthTag::DepthTag(RecursiveLogger &_logger) : logger(_logger) { 47 | this->logger.enter(); 48 | } 49 | 50 | DepthTag::~DepthTag() { 51 | this->logger.leave(); 52 | } 53 | 54 | }; // namespace FlexFlow 55 | -------------------------------------------------------------------------------- /src/runtime/tensor.cpp: -------------------------------------------------------------------------------- 1 | #include "flexflow/accessor.h" 2 | #include "flexflow/config.h" 3 | #include "flexflow/model.h" 4 | #include "flexflow/parallel_tensor.h" 5 | #include "flexflow/utils/hip_helper.h" 6 | #include <hip/hip_runtime.h> 7 | 8 | namespace FlexFlow {} // namespace FlexFlow 9 | -------------------------------------------------------------------------------- /src/runtime/tensor.cu: -------------------------------------------------------------------------------- 1 | #include "flexflow/accessor.h" 2 | #include "flexflow/config.h" 3 | #include "flexflow/model.h" 4 | #include "flexflow/parallel_tensor.h" 5 | #include "flexflow/utils/cuda_helper.h" 6 | 7 | namespace FlexFlow {} // namespace FlexFlow 8 | -------------------------------------------------------------------------------- /src/utils/dot/record_formatter.cc: -------------------------------------------------------------------------------- 1 | #include "flexflow/utils/dot/record_formatter.h" 2 | 3 | RecordFormatter &operator<<(RecordFormatter &r, std::string const &tok) { 4 | r.pieces.push_back(tok); 5 | 6 | return r; 7 | } 8 | 9 | RecordFormatter &operator<<(RecordFormatter &r, int tok) { 10 | std::ostringstream oss; 11 | oss << tok; 12 | 
13 | r << oss; 14 | 15 | return r; 16 | } 17 | 18 | RecordFormatter &operator<<(RecordFormatter &r, float tok) { 19 | std::ostringstream oss; 20 | oss << std::scientific; 21 | oss << tok; 22 | 23 | r << oss; 24 | 25 | return r; 26 | } 27 | 28 | RecordFormatter &operator<<(RecordFormatter &r, RecordFormatter const &sub_r) { 29 | std::ostringstream oss; 30 | oss << sub_r; 31 | r << oss.str(); 32 | 33 | return r; 34 | } 35 | 36 | RecordFormatter &operator<<(RecordFormatter &r, std::ostringstream &oss) { 37 | r << oss.str(); 38 | 39 | return r; 40 | } 41 | 42 | std::ostream &operator<<(std::ostream &s, RecordFormatter const &r) { 43 | s << "{ "; 44 | for (size_t i = 0; i < r.pieces.size(); i++) { 45 | s << r.pieces[i]; 46 | if (i + 1 < r.pieces.size()) { 47 | s << " | "; 48 | } 49 | } 50 | s << " }"; 51 | 52 | return s; 53 | } -------------------------------------------------------------------------------- /tests/align/README.md: -------------------------------------------------------------------------------- 1 | # FlexFlow-PyTorch Alignment 2 | 3 | This is an ongoing effort to align FlexFlow with PyTorch as a means to verify 4 | the correctness of FlexFlow. Support for additional operators will be coming 5 | soon, and all alignment files here are subject to change. 6 | ## Install the Python dependencies 7 | Install the `pytest` module in the `flexflow` environment. 8 | 9 | ## Running the Alignment Tests 10 | Note that FlexFlow requires a CPU installation of PyTorch, so we recommend a 11 | separate `conda` environment for each (e.g. named `flexflow` and `pytorch`, 12 | respectively). 13 | 14 | Assuming those two `conda` environments, we may run 15 | ``` 16 | cd FlexFlow 17 | conda activate flexflow 18 | ./tests/align/test_all_operators.sh 19 | ``` 20 | 21 | -------------------------------------------------------------------------------- /tests/align/mt5_encoder/align_mt5_encoder_ff.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from flexflow.core import * 5 | 6 | sys.path.append("./align/") 7 | from align_ff_utils import run_fwd_bwd 8 | from mt5_ff_utils import init_ff_mt5_encoder 9 | 10 | # NOTE: We use the PyTorch mT5 encoder output as the labels 11 | ENCODER_LABELS_PATH = os.path.join( 12 | "align", "mt5_encoder", "out", "hidden_states.pt", 13 | ) 14 | 15 | 16 | def run(): 17 | assert os.path.exists(ENCODER_LABELS_PATH), \ 18 | "Make sure to generate the encoder labels file (e.g. 
by modifying " \ 19 | "the transformers library source code)" 20 | ffmodel, input_dls, label_dl = init_ff_mt5_encoder( 21 | ENCODER_LABELS_PATH, 22 | ) 23 | run_fwd_bwd(ffmodel, ffmodel._ffconfig, input_dls, label_dl) 24 | 25 | 26 | if __name__ == "__main__": 27 | run() 28 | -------------------------------------------------------------------------------- /tests/align/peft_flash_attn/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Debug FlexFlow PEFT", 6 | "type": "cppdbg", 7 | "request": "launch", 8 | "program": "${workspaceFolder}/build/inference/peft/peft", 9 | "args": [ 10 | "-ll:gpu", "1", 11 | "-ll:cpu", "4", 12 | "-ll:util", "4", 13 | "-tensor-parallelism-degree", "1", 14 | "-ll:fsize", "8192", 15 | "-ll:zsize", "12000", 16 | "--max-requests-per-batch", "1", 17 | "--max-sequence-length", "128", 18 | "--max-tokens-per-batch", "128", 19 | "-llm-model", "JackFram/llama-160m", 20 | "-finetuning-dataset", "./inference/prompt/peft_dataset.json", 21 | "-peft-model", "goliaro/llama-160m-lora", 22 | "-enable-peft", 23 | "--use-full-precision", 24 | "--inference-debugging" 25 | ], 26 | "stopAtEntry": false, 27 | "cwd": "${workspaceFolder}", 28 | "environment": [ 29 | { 30 | "name": "LD_LIBRARY_PATH", 31 | "value": "/root/flexflow-serve/build:/root/flexflow-serve/build/deps/legion/lib:/opt/conda/lib/python3.12/site-packages/torch/lib:/opt/conda/lib/python3.12/site-packages:/opt/conda/lib" 32 | } 33 | ], 34 | "externalConsole": false, 35 | "MIMode": "gdb", 36 | "miDebuggerPath": "/usr/bin/gdb", 37 | "setupCommands": [ 38 | { 39 | "description": "Enable pretty-printing for gdb", 40 | "text": "-enable-pretty-printing", 41 | "ignoreFailures": true 42 | } 43 | ] 44 | } 45 | ] 46 | } -------------------------------------------------------------------------------- /tests/align/peft_flash_attn/peft_flash_debug_note: -------------------------------------------------------------------------------- 1 | # Memo for peft debug steps 2 | 3 | #export libtorch as env var (add to docker build later): 4 | export LD_LIBRARY_PATH=/opt/conda/lib/python3.12/site-packages/torch/lib:$LD_LIBRARY_PATH 5 | 6 | #before running the tests, run: 7 | source ./build/set_python_envs.sh 8 | 9 | # perform peft test: 10 | ./tests/peft_test.sh 11 | 12 | # build debug: 13 | export BUILD_TYPE=Debug 14 | 15 | # build: 16 | ../config/config.linux 17 | make -j 18 | 19 | # install build 20 | make -j install 21 | 22 | # Steps to debug: 23 | # 1. install gdb-python3 24 | # apt-get update && apt-get install -y python3-dbg 25 | # 2. do not do env setup before this, run in gdb and set env in gdb: 26 | gdb -ex run --args ./build/inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:util 4 -tensor-parallelism-degree 1 -ll:fsize 8192 -ll:zsize 12000 --max-requests-per-batch 1 --max-sequence-length 128 --max-tokens-per-batch 128 -llm-model JackFram/llama-160m -finetuning-dataset ./inference/prompt/peft_dataset.json -peft-model goliaro/llama-160m-lora -enable-peft --inference-debugging 27 | # --use-full-precision: ignore this case for now 28 | # 3.in gdb, export all in /build/set_python_envs.sh 29 | set environment LD_LIBRARY_PATH=/root/flexflow-serve/build:/root/flexflow-serve/build/deps/legion/lib:$(dirname $(/root/flexflow-serve/python/flexflow/findpylib.py)):/opt/conda/lib/python3.12/site-packages/torch/lib:/opt/conda/lib/python3.12/site-packages:$LD_LIBRARY_PATH 30 | # 4. 
and then run 31 | run 32 | 33 | # single C++ tests: 34 | ./build/inference/peft/peft -ll:gpu 2 -ll:cpu 4 -ll:util 4 -tensor-parallelism-degree 2 -ll:fsize 8192 -ll:zsize 12000 --max-requests-per-batch 1 --max-sequence-length 128 --max-tokens-per-batch 128 -llm-model JackFram/llama-160m -finetuning-dataset ./inference/prompt/peft_dataset.json -peft-model goliaro/llama-160m-lora -enable-peft --inference-debugging -------------------------------------------------------------------------------- /tests/align/test_all_operators.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | eval "$(conda shell.bash hook)" 3 | 4 | rm -rf align/out 5 | 6 | function generate_ff_tensor(){ 7 | python tests/align/align_create_tensor_ff.py -ll:gpu 1 -ll:fsize 5000 -ll:zsize 4096 -b 16 -o "$1" 8 | } 9 | 10 | function generate_torch_tensor(){ 11 | python tests/align/align_create_tensor_torch.py -o "$1" 12 | } 13 | 14 | ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear layernorm gather) 15 | 16 | #create flexflow tensors 17 | conda activate flexflow 18 | conda info --envs 19 | for(( i=0;i<${#ops[@]};i++)) 20 | do 21 | generate_ff_tensor "${ops[i]}"; 22 | done; 23 | 24 | #create torch tensors 25 | conda activate pytorch 26 | for(( i=0;i<${#ops[@]};i++)) 27 | do 28 | generate_torch_tensor "${ops[i]}"; 29 | done; 30 | 31 | conda activate flexflow 32 | python -m pytest tests/align/align_test.py 33 | -------------------------------------------------------------------------------- /tests/inference/huggingface_inference_simple.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | AutoModelForCausalLM, 3 | AutoTokenizer, 4 | AutoConfig, 5 | GenerationConfig, 6 | ) 7 | 8 | model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" 9 | do_sample = False 10 | max_length = 128 11 | model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto",) 12 | hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) 13 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 14 | generation_config = GenerationConfig.from_pretrained(model_name) 15 | print(generation_config.do_sample) 16 | generation_config.do_sample = do_sample 17 | generation_config.num_beams=1 18 | generation_config.temperature = None 19 | generation_config.top_p = None 20 | 21 | 22 | def run_text_completion(): 23 | prompt = "Help me plan a 1-week trip to Dubai" 24 | batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) 25 | 26 | generated = model.generate( 27 | batch["input_ids"], 28 | max_new_tokens=max_length, 29 | generation_config=generation_config, 30 | ) 31 | out = tokenizer.decode(generated[0]) 32 | print(out) 33 | 34 | def run_chat_completion(): 35 | messages=[ 36 | {"role": "system", "content": "You are a helpful and honest programming assistant."}, 37 | {"role": "user", "content": "Is Rust better than Python?"}, 38 | ] 39 | tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 40 | batch = tokenizer(tokenized_chat, return_tensors="pt") 41 | 42 | generated = model.generate( 43 | batch["input_ids"], 44 | max_new_tokens=max_length, 45 | generation_config=generation_config, 46 | ) 47 | out = tokenizer.decode(generated[0], 
skip_special_tokens=True, clean_up_tokenization_spaces=True) 48 | prompt_length = len(tokenizer.decode(batch["input_ids"][0], skip_special_tokens=True, clean_up_tokenization_spaces=True)) 49 | all_text = out[prompt_length:] 50 | print(all_text) 51 | run_chat_completion() -------------------------------------------------------------------------------- /tests/inference/huggingface_pipeline.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | from transformers import GenerationConfig 3 | 4 | model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct" 5 | do_sample = False 6 | 7 | generation_config = GenerationConfig.from_pretrained(model_id) 8 | generation_config.do_sample = do_sample 9 | generation_config.num_beams=1 10 | # generation_config.max_length = 128 11 | generation_config.temperature = None 12 | generation_config.top_p = None 13 | print(generation_config) 14 | 15 | pipeline = transformers.pipeline( 16 | "text-generation", 17 | model=model_id, 18 | # model_kwargs={"torch_dtype": torch.bfloat16}, 19 | device_map="auto", 20 | ) 21 | 22 | messages=[ 23 | {"role": "system", "content": "You are a helpful and honest programming assistant."}, 24 | {"role": "user", "content": "Is Rust better than Python?"}, 25 | ] 26 | 27 | # messages="Help me plan a 1-week trip to Dubai" 28 | outputs = pipeline( 29 | messages, 30 | max_new_tokens=128, 31 | generation_config=generation_config, 32 | ) 33 | print(outputs[0]["generated_text"][-1]['content']) -------------------------------------------------------------------------------- /tests/multinode_helpers/mpi_wrapper1.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | set -x 3 | set -e 4 | 5 | if [ -z "$FF_HOME" ]; then echo "FF_HOME variable is not defined, aborting tests"; exit; fi 6 | if [ -z "$NUM_NODES" ]; then echo "NUM_NODES variable is not defined, aborting tests"; exit; fi 7 | if [ -z "$GPUS" ]; then echo "GPUS variable is not defined, aborting tests"; exit; fi 8 | 9 | # We need to wrap the instruction below in its own script because MPI throws an error if we try 10 | # to run "mpirun" more than once in the same script. Hence, we cannot simply call "mpirun" in the 11 | # training_tests.sh script 12 | mpirun -np "$NUM_NODES" "$FF_HOME"/tests/multinode_helpers/mpi_wrapper2.sh "$@" 13 | -------------------------------------------------------------------------------- /tests/multinode_helpers/mpi_wrapper2.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | set -x 3 | set -e 4 | 5 | if [ -z "$NUM_NODES" ]; then echo "NUM_NODES variable is not defined, aborting tests"; exit; fi 6 | if [ -z "$GPUS" ]; then echo "GPUS variable is not defined, aborting tests"; exit; fi 7 | 8 | # We need to wrap the instruction below in its own script because the CUDA_VISIBLE_DEVICES environment 9 | # variable will need to be set differently for each node, but the "mpirun" command should take a single 10 | # executable as its first argument. Each rank gets the GPUS devices starting at rank * GPUS. 11 | CUDA_VISIBLE_DEVICES=$(seq -s, $((OMPI_COMM_WORLD_RANK * GPUS )) $(( OMPI_COMM_WORLD_RANK * GPUS + GPUS - 1 )) ) 12 | export CUDA_VISIBLE_DEVICES 13 | 14 | python "$@" 15 | -------------------------------------------------------------------------------- /tests/python_interface_test.sh: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env bash 2 | set -x 3 | set -e 4 | 5 | 6 | FF_HOME="$(realpath "${BASH_SOURCE[0]%/*}/..")" 7 | export FF_HOME 8 | # Edit the folder below if you did not build FlexFlow in $FF_HOME/build 9 | BUILD_FOLDER="${FF_HOME}/build" 10 | export BUILD_FOLDER 11 | 12 | # Token to access private huggingface models (e.g. LLAMA-2) 13 | HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none} 14 | if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then 15 | huggingface-cli login --token "$HUGGINGFACE_TOKEN" 16 | fi 17 | 18 | installation_status=${1:-"before-installation"} 19 | echo "Running Python interface tests (installation status: ${installation_status})" 20 | if [[ "$installation_status" == "before-installation" ]]; then 21 | # Check availability of flexflow modules in Python 22 | export PYTHONPATH="${FF_HOME}/python:${BUILD_FOLDER}/deps/legion/bindings/python:${PYTHONPATH}" 23 | export LD_LIBRARY_PATH="${BUILD_FOLDER}:${LD_LIBRARY_PATH}" 24 | python -c "import flexflow.core; import flexflow.serve as ff; exit()" 25 | unset PYTHONPATH 26 | unset LD_LIBRARY_PATH 27 | # Run simple python inference test 28 | export LD_LIBRARY_PATH="${BUILD_FOLDER}:${BUILD_FOLDER}/deps/legion/lib:${LD_LIBRARY_PATH}" 29 | export PYTHONPATH="${FF_HOME}/python:${BUILD_FOLDER}/deps/legion/bindings/python:${PYTHONPATH}" 30 | python "$FF_HOME"/inference/python/incr_decoding.py 31 | unset PYTHONPATH 32 | unset LD_LIBRARY_PATH 33 | elif [[ "$installation_status" == "after-installation" ]]; then 34 | # Check availability of flexflow modules in Python 35 | python -c "import flexflow.core; import flexflow.serve as ff; exit()" 36 | # Run simple python inference test 37 | python "$FF_HOME"/inference/python/incr_decoding.py 38 | else 39 | echo "Invalid installation status!" 40 | echo "Usage: $0 {before-installation, after-installation}" 41 | exit 1 42 | fi 43 | -------------------------------------------------------------------------------- /triton/README.md: -------------------------------------------------------------------------------- 1 | # Legion Triton Backend 2 | 3 | This directory contains an incomplete prototype for a new 4 | [backend for Triton](https://github.com/triton-inference-server/backend) built on top of the 5 | [Legion runtime](https://legion.stanford.edu) for handling multi-node multi-GPU inference 6 | requests. While Legion is the primary runtime carrying out multi-node inference jobs, users 7 | do not need to understand Legion at all to use this backend. 8 | 9 | ## Build instructions 10 | 11 | ### CMake 12 | 13 | A simple CMake configuration is provided to build the Legion backend and resolve its dependencies. 14 | Note that the build will install protobuf with customized settings, so please make sure 15 | that the system does not already have protobuf installed, to avoid conflicts. 16 | 17 | ``` 18 | $ mkdir build 19 | $ cd build 20 | $ cmake .. 21 | $ make 22 | ``` 23 | 24 | After the build, the backend shared library can be found at `/PATH/TO/BUILDDIR/triton-legion/backends/legion`. 25 | 26 | By default, the unit tests and test data are installed at `/PATH/TO/BUILDDIR/triton-legion/test`; 27 | the tests can be run after switching the current directory to that location. 
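28 | 29 | For example, a minimal sketch of running one of the installed unit tests (assuming the default install location above; `onnx_parser_test` is the binary exercised by the `qa/L0_parser` test script, and the exact set of installed binaries may vary with the build): 30 | 31 | ``` 32 | $ cd /PATH/TO/BUILDDIR/triton-legion/test 33 | $ ./onnx_parser_test 34 | ``` 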
35 | 36 | ### Make 37 | 38 | Protobuf is required for the backend, and it must be installed from source with the following commands to build the 39 | static protobuf library that can be linked with the backend shared library: 40 | 41 | ``` 42 | git clone https://github.com/protocolbuffers/protobuf.git 43 | cd protobuf && git checkout v3.17.1 44 | cd cmake 45 | cmake -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -Dprotobuf_BUILD_TESTS:BOOL=OFF -Dprotobuf_WITH_ZLIB:BOOL=OFF -Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF -DCMAKE_BUILD_TYPE:STRING=RELEASE -DBUILD_SHARED_LIBS:STRING=no . 46 | make install 47 | ``` 48 | 49 | Set the `LG_RT_DIR` environment variable to point to the `legion/runtime` directory in a Legion repo. 50 | 51 | Set the `TRITON_DIR` environment variable to point to an installation of the Triton server. 52 | 53 | Go into the `src` directory and type `make`. 54 | 55 | Copy the `libtriton_flexflow.so` shared object to a Triton model repository. 56 | -------------------------------------------------------------------------------- /triton/cmake/TritonLegionBackendConfig.cmake.in: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #------------------------------------------------------------------------------# 16 | 17 | include(CMakeFindDependencyMacro) 18 | 19 | get_filename_component( 20 | TRITONLEGIONBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH 21 | ) 22 | 23 | list(APPEND CMAKE_MODULE_PATH ${TRITONLEGIONBACKEND_CMAKE_DIR}) 24 | 25 | if(NOT TARGET TritonLegionBackend::triton-legion-backend) 26 | include("${TRITONLEGIONBACKEND_CMAKE_DIR}/TritonLegionBackendTargets.cmake") 27 | endif() 28 | 29 | set(TRITONLEGIONBACKEND_LIBRARIES TritonLegionBackend::triton-legion-backend) 30 | -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/add/1/model.onnx: -------------------------------------------------------------------------------- 1 | model:y 2 |  3 | input0 4 | input1output"Add 5 | test_graphZ 6 | input0 7 |  8 |  9 | Z 10 | input1 11 |  12 |  13 | b 14 | output 15 |  16 |  17 | B -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/add/1/model.strategy: -------------------------------------------------------------------------------- 1 | 1 Add 0 2 1 1 1 0 -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/add/config.pbtxt: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #------------------------------------------------------------------------------# 16 | 17 | name: "add" 18 | backend: "legion" 19 | max_batch_size: 0 20 | input [ 21 | { 22 | name: "input0" 23 | data_type: TYPE_FP32 24 | dims: [ 4, 2 ] 25 | }, 26 | { 27 | name: "input1" 28 | data_type: TYPE_FP32 29 | dims: [ 4, 2 ] 30 | } 31 | ] 32 | output [ 33 | { 34 | name: "output" 35 | data_type: TYPE_FP32 36 | dims: [ 4, 2 ] 37 | } 38 | ] 39 | instance_group [ { kind : KIND_MODEL }] 40 | -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/cast/1/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/qa/L0_e2e/models/cast/1/model.onnx -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/cast/1/model.strategy: -------------------------------------------------------------------------------- 1 | 1 Cast 0 2 1 1 1 0 -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/cast/config.pbtxt: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #------------------------------------------------------------------------------# 16 | 17 | name: "cast" 18 | backend: "legion" 19 | max_batch_size: 0 20 | input [ 21 | { 22 | name: "input" 23 | data_type: TYPE_FP32 24 | dims: [ 1, 3 ] 25 | } 26 | ] 27 | output [ 28 | { 29 | name: "output" 30 | data_type: TYPE_FP64 31 | dims: [ 1, 3 ] 32 | } 33 | ] 34 | instance_group [ { kind : KIND_MODEL }] 35 | -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/identity/1/model.onnx: -------------------------------------------------------------------------------- 1 | model:j 2 |  3 | inputoutput"Identity 4 | test_graphZ 5 | input 6 |  7 |  8 |  9 |  10 | b 11 | output 12 |  13 |  14 |  15 |  16 | B -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/identity/1/model.strategy: -------------------------------------------------------------------------------- 1 | 1 Identity_0 0 4 1 1 1 1 1 0 -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/identity/config.pbtxt: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #------------------------------------------------------------------------------# 16 | 17 | name: "identity" 18 | backend: "legion" 19 | max_batch_size: 0 20 | input [ 21 | { 22 | name: "input" 23 | data_type: TYPE_FP32 24 | dims: [ 4, 1, 5, 5 ] 25 | } 26 | ] 27 | output [ 28 | { 29 | name: "output" 30 | data_type: TYPE_FP32 31 | dims: [ 4, 1, 5, 5 ] 32 | } 33 | ] 34 | instance_group [ { kind : KIND_MODEL }] 35 | -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/mul/1/model.onnx: -------------------------------------------------------------------------------- 1 | model:y 2 |  3 | input0 4 | input1output"Mul 5 | test_graphZ 6 | input0 7 |  8 |  9 | Z 10 | input1 11 |  12 |  13 | b 14 | output 15 |  16 |  17 | B -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/mul/1/model.strategy: -------------------------------------------------------------------------------- 1 | 1 Mul 0 2 1 1 1 0 -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/mul/config.pbtxt: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #------------------------------------------------------------------------------# 16 | 17 | name: "mul" 18 | backend: "legion" 19 | max_batch_size: 0 20 | input [ 21 | { 22 | name: "input0" 23 | data_type: TYPE_FP32 24 | dims: [ 4, 2 ] 25 | }, 26 | { 27 | name: "input1" 28 | data_type: TYPE_FP32 29 | dims: [ 4, 2 ] 30 | } 31 | ] 32 | output [ 33 | { 34 | name: "output" 35 | data_type: TYPE_FP32 36 | dims: [ 4, 2 ] 37 | } 38 | ] 39 | instance_group [ { kind : KIND_MODEL }] 40 | -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/reciprocal/1/model.onnx: -------------------------------------------------------------------------------- 1 | model:\ 2 |  3 | inputoutput" 4 | Reciprocal 5 | test_graphZ 6 | input 7 |  8 |  9 | b 10 | output 11 |  12 |  13 | B -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/reciprocal/1/model.strategy: -------------------------------------------------------------------------------- 1 | 1 Reciprocal 0 2 1 1 1 0 -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/reciprocal/config.pbtxt: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #------------------------------------------------------------------------------# 16 | 17 | name: "reciprocal" 18 | backend: "legion" 19 | max_batch_size: 0 20 | input [ 21 | { 22 | name: "input" 23 | data_type: TYPE_FP32 24 | dims: [ 1, 3 ] 25 | } 26 | ] 27 | output [ 28 | { 29 | name: "output" 30 | data_type: TYPE_FP32 31 | dims: [ 1, 3 ] 32 | } 33 | ] 34 | instance_group [ { kind : KIND_MODEL }] 35 | -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/softmax/1/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/qa/L0_e2e/models/softmax/1/model.onnx -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/softmax/1/model.strategy: -------------------------------------------------------------------------------- 1 | 1 Softmax 0 2 1 1 1 0 -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/softmax/config.pbtxt: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #------------------------------------------------------------------------------# 16 | 17 | name: "softmax" 18 | backend: "legion" 19 | max_batch_size: 0 20 | input [ 21 | { 22 | name: "input" 23 | data_type: TYPE_FP32 24 | dims: [ 3, 1 ] 25 | } 26 | ] 27 | output [ 28 | { 29 | name: "output" 30 | data_type: TYPE_FP32 31 | dims: [ 3, 1 ] 32 | } 33 | ] 34 | instance_group [ { kind : KIND_MODEL }] 35 | -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/softmax1/1/model.onnx: -------------------------------------------------------------------------------- 1 | model:Y 2 |  3 | inputoutput"Softmax 4 | test_graphZ 5 | input 6 |  7 |  8 | b 9 | output 10 |  11 |  12 | B -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/softmax1/1/model.strategy: -------------------------------------------------------------------------------- 1 | 1 Softmax_1 0 2 1 1 1 0 -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/softmax1/config.pbtxt: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #------------------------------------------------------------------------------# 16 | 17 | name: "softmax1" 18 | backend: "legion" 19 | max_batch_size: 0 20 | input [ 21 | { 22 | name: "input" 23 | data_type: TYPE_FP32 24 | dims: [ 3, 1 ] 25 | } 26 | ] 27 | output [ 28 | { 29 | name: "output" 30 | data_type: TYPE_FP32 31 | dims: [ 3, 1 ] 32 | } 33 | ] 34 | instance_group [ { kind : KIND_MODEL }] 35 | -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/sqrt/1/model.onnx: -------------------------------------------------------------------------------- 1 | model:V 2 |  3 | inputoutput"Sqrt 4 | test_graphZ 5 | input 6 |  7 |  8 | b 9 | output 10 |  11 |  12 | B -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/sqrt/1/model.strategy: -------------------------------------------------------------------------------- 1 | 1 Sqrt 0 2 1 1 1 0 -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/sqrt/config.pbtxt: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #------------------------------------------------------------------------------# 16 | 17 | name: "sqrt" 18 | backend: "legion" 19 | max_batch_size: 0 20 | input [ 21 | { 22 | name: "input" 23 | data_type: TYPE_FP32 24 | dims: [ 3, 1 ] 25 | } 26 | ] 27 | output [ 28 | { 29 | name: "output" 30 | data_type: TYPE_FP32 31 | dims: [ 3, 1 ] 32 | } 33 | ] 34 | instance_group [ { kind : KIND_MODEL }] 35 | -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/sub/1/model.onnx: -------------------------------------------------------------------------------- 1 | model:y 2 |  3 | input0 4 | input1output"Sub 5 | test_graphZ 6 | input0 7 |  8 |  9 | Z 10 | input1 11 |  12 |  13 | b 14 | output 15 |  16 |  17 | B -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/sub/1/model.strategy: -------------------------------------------------------------------------------- 1 | 1 Sub 0 2 1 1 1 0 -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/sub/config.pbtxt: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #------------------------------------------------------------------------------# 16 | 17 | name: "sub" 18 | backend: "legion" 19 | max_batch_size: 0 20 | input [ 21 | { 22 | name: "input0" 23 | data_type: TYPE_FP32 24 | dims: [ 4, 2 ] 25 | }, 26 | { 27 | name: "input1" 28 | data_type: TYPE_FP32 29 | dims: [ 4, 2 ] 30 | } 31 | ] 32 | output [ 33 | { 34 | name: "output" 35 | data_type: TYPE_FP32 36 | dims: [ 4, 2 ] 37 | } 38 | ] 39 | instance_group [ { kind : KIND_MODEL }] 40 | -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/tanh/1/model.onnx: -------------------------------------------------------------------------------- 1 | model:V 2 |  3 | inputoutput"Tanh 4 | test_graphZ 5 | input 6 |  7 |  8 | b 9 | output 10 |  11 |  12 | B -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/tanh/1/model.strategy: -------------------------------------------------------------------------------- 1 | 1 Tanh 0 2 1 1 1 0 -------------------------------------------------------------------------------- /triton/qa/L0_e2e/models/tanh/config.pbtxt: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #------------------------------------------------------------------------------# 16 | 17 | name: "tanh" 18 | backend: "legion" 19 | max_batch_size: 0 20 | input [ 21 | { 22 | name: "input" 23 | data_type: TYPE_FP32 24 | dims: [ 3, 1 ] 25 | } 26 | ] 27 | output [ 28 | { 29 | name: "output" 30 | data_type: TYPE_FP32 31 | dims: [ 3, 1 ] 32 | } 33 | ] 34 | instance_group [ { kind : KIND_MODEL }] 35 | -------------------------------------------------------------------------------- /triton/qa/L0_e2e/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #------------------------------------------------------------------------------# 3 | # Copyright 2022 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | #------------------------------------------------------------------------------# 17 | 18 | TEST_PY=operator_test.py 19 | DATADIR="./models" 20 | SERVER=/opt/tritonserver/bin/tritonserver 21 | SERVER_ARGS="--model-repository=$DATADIR" 22 | source ../common/util.sh 23 | 24 | rm -f *.log* 25 | 26 | RET=0 27 | 28 | # 1 GPU 1 node 29 | export REALM_DEFAULT_ARGS="-ll:gpu 1" 30 | TEST_LOG="./single_device_single_node.log" 31 | 32 | run_server 33 | if [ "$SERVER_PID" == "0" ]; then 34 | echo -e "\n***\n*** Failed to start $SERVER\n***" 35 | cat $SERVER_LOG 36 | exit 1 37 | fi 38 | 39 | set +e 40 | python $TEST_PY >>$TEST_LOG 2>&1 41 | if [ $? -ne 0 ]; then 42 | echo -e "\n***\n*** Test Failed\n***" 43 | cat $TEST_LOG 44 | RET=1 45 | fi 46 | set -e 47 | 48 | # [issue #7] WAR to ignore core dump on server exit 49 | set +e 50 | kill_server 51 | set -e 52 | 53 | # [gluo FIXME] add test for multi-GPU / multi-node 54 | 55 | if [ $RET -eq 0 ]; then 56 | echo -e "\n***\n*** Test Passed\n***" 57 | else 58 | echo -e "\n***\n*** Test Failed\n***" 59 | fi 60 | 61 | exit $RET 62 | -------------------------------------------------------------------------------- /triton/qa/L0_e2e/test_helpers.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #------------------------------------------------------------------------------# 16 | 17 | import numpy as np 18 | 19 | def softmax(input, axis): 20 | # Subtract the max along `axis` before exponentiating for numerical stability. 21 | output = np.exp(input - np.max(input, axis, keepdims=True)) 22 | return output / np.sum(output, axis, keepdims=True) 23 | -------------------------------------------------------------------------------- /triton/qa/L0_parser/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #------------------------------------------------------------------------------# 3 | # Copyright 2022 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | #------------------------------------------------------------------------------# 17 | 18 | TEST_BIN=./onnx_parser_test 19 | 20 | rm -f *.log* 21 | 22 | RET=0 23 | 24 | TEST_LOG="./parser.log" 25 | 26 | set +e 27 | $TEST_BIN >>$TEST_LOG 2>&1 28 | if [ $? -ne 0 ]; then 29 | echo -e "\n***\n*** Test Failed\n***" 30 | cat $TEST_LOG 31 | RET=1 32 | fi 33 | set -e 34 | 35 | if [ $RET -eq 0 ]; then 36 | echo -e "\n***\n*** Test Passed\n***" 37 | else 38 | echo -e "\n***\n*** Test Failed\n***" 39 | fi 40 | 41 | exit $RET 42 | -------------------------------------------------------------------------------- /triton/src/config.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2022 NVIDIA CORPORATION 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | #ifndef __LEGION_TRITON_CONFIG_H__ 17 | #define __LEGION_TRITON_CONFIG_H__ 18 | 19 | // Configuration constants for upper bounds for some static properties 20 | 21 | // Maximum number of instances per model that we expect to see 22 | #define MAX_NUM_INSTANCES 8 23 | 24 | // Maximum number of local processors that we need to handle in this process 25 | #define MAX_LOCAL_PROCS 16 26 | 27 | #endif // __LEGION_TRITON_CONFIG_H__ 28 | -------------------------------------------------------------------------------- /triton/src/libtriton_legion.ldscript: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------# 2 | # Copyright 2022 NVIDIA CORPORATION 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #------------------------------------------------------------------------------# 16 | { 17 | global: 18 | TRITONBACKEND_*; 19 | extern "C++" { 20 | triton::backend::legion::*; 21 | }; 22 | local: *; 23 | }; 24 | -------------------------------------------------------------------------------- /triton/src/operator.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2022 NVIDIA CORPORATION 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | #include "operator.h" 17 | #include "operators/binary.h" 18 | #include "operators/concat.h" 19 | #include "operators/conv2d.h" 20 | #include "operators/matmul.h" 21 | #include "operators/reshape.h" 22 | #include "operators/softmax.h" 23 | #include "operators/unary.h" 24 | #include "tensor.h" 25 | 26 | namespace triton { namespace backend { namespace legion { 27 | 28 | Operator::Operator( 29 | LegionModelState* m, const LayerStrategy* s, OperatorType t, 30 | const char* name, unsigned in, unsigned wts, unsigned out) 31 | : op_type(t), op_name(name), model(m), strategy(s), num_inputs(in), 32 | num_weights(wts), num_outputs(out) 33 | { 34 | } 35 | 36 | Operator::~Operator(void) 37 | { 38 | // Delete all the weight and output tensors 39 | for (auto wts : weights) delete wts; 40 | for (auto tensor : outputs) delete tensor; 41 | } 42 | 43 | /*static*/ void 44 | Operator::PreregisterTaskVariants(void) 45 | { 46 | BinaryOperator::PreregisterTaskVariants(); 47 | Concat::PreregisterTaskVariants(); 48 | Conv2D::PreregisterTaskVariants(); 49 | MatMul::PreregisterTaskVariants(); 50 | Reshape::PreregisterTaskVariants(); 51 | Softmax::PreregisterTaskVariants(); 52 | UnaryOperator::PreregisterTaskVariants(); 53 | } 54 | 55 | }}} // namespace triton::backend::legion 56 | -------------------------------------------------------------------------------- /triton/src/operators/flat.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2022 NVIDIA CORPORATION 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | #ifndef __LEGION_TRITON_FLAT_H__ 17 | #define __LEGION_TRITON_FLAT_H__ 18 | 19 | #include "operator.h" 20 | #include "tensor.h" 21 | 22 | namespace triton { namespace backend { namespace legion { 23 | 24 | struct FlatArgs : public OperatorArgs { 25 | public: 26 | }; 27 | 28 | class Flat : public Operator { 29 | public: 30 | Flat(LegionModelState* state, const char* name); 31 | 32 | void configure(Tensor* input, Tensor* output); 33 | 34 | virtual void initialize(LegionModelInstance* instance); 35 | virtual void forward(LegionModelInstance* instance); 36 | virtual void finalize(LegionModelInstance* instance); 37 | 38 | static FlatArgs initalize_gpu( 39 | const Legion::Task* task, 40 | const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx, 41 | Legion::Runtime* runtime); 42 | static void forward_gpu( 43 | const Legion::Task* task, 44 | const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx, 45 | Legion::Runtime* runtime); 46 | static void forward_kernel( 47 | const FlatArgs* args, const void* input_ptr, void* output_ptr, 48 | size_t num_elements); 49 | 50 | public: 51 | LegionModelState* const model; 52 | }; 53 | 54 | }}} // namespace triton::backend::legion 55 | 56 | #endif // __LEGION_TRITON_FLAT_H__ 57 | -------------------------------------------------------------------------------- /triton/src/tensor.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2022 NVIDIA CORPORATION 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | 16 | #ifndef __LEGION_TRITON_TENSOR_H__ 17 | #define __LEGION_TRITON_TENSOR_H__ 18 | 19 | #include "config.h" 20 | #include "legion.h" 21 | #include "types.h" 22 | 23 | namespace triton { namespace backend { namespace legion { 24 | 25 | class Tensor { 26 | public: 27 | Tensor(Operator* op, DataType type, const size_t* dims, size_t num_dims); 28 | Tensor(Operator* op, DataType type, const std::vector<size_t>& dims); 29 | virtual ~Tensor(void); 30 | 31 | public: 32 | Operator* const owner; 33 | const DataType type; 34 | const std::vector<size_t> bounds; 35 | 36 | public: 37 | Legion::LogicalRegion region[MAX_NUM_INSTANCES]; 38 | Legion::LogicalPartition partition[MAX_NUM_INSTANCES]; 39 | }; 40 | 41 | class Weights : public Tensor { 42 | public: 43 | Weights(Operator* op, DataType type, const size_t* dims, size_t num_dims); 44 | Weights(Operator* op, DataType type, const std::vector<size_t>& dims); 45 | virtual ~Weights(void); 46 | 47 | public: 48 | Legion::Domain local_bounds[MAX_LOCAL_PROCS]; 49 | Legion::Memory local_memory[MAX_LOCAL_PROCS]; 50 | void* local_allocation[MAX_LOCAL_PROCS]; 51 | size_t local_strides[MAX_LOCAL_PROCS][LEGION_MAX_DIM]; 52 | }; 53 | 54 | }}} // namespace triton::backend::legion 55 | 56 | #endif // __LEGION_TRITON_TENSOR_H__ 57 | -------------------------------------------------------------------------------- /triton/src/test/data/add.onnx: -------------------------------------------------------------------------------- 1 | model:y 2 |  3 | input0 4 | input1output"Add 5 | test_graphZ 6 | input0 7 |  8 |  9 | Z 10 | input1 11 |  12 |  13 | b 14 | output 15 |  16 |  17 | B -------------------------------------------------------------------------------- /triton/src/test/data/avg_pool.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/avg_pool.onnx -------------------------------------------------------------------------------- /triton/src/test/data/avg_pool_autopad.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/avg_pool_autopad.onnx -------------------------------------------------------------------------------- /triton/src/test/data/avg_pool_ceil.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/avg_pool_ceil.onnx -------------------------------------------------------------------------------- /triton/src/test/data/avg_pool_count_include_pad.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/avg_pool_count_include_pad.onnx -------------------------------------------------------------------------------- /triton/src/test/data/avg_pool_pad.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/avg_pool_pad.onnx -------------------------------------------------------------------------------- /triton/src/test/data/cast.onnx: -------------------------------------------------------------------------------- 
-------------------------------------------------------------------------------- /triton/src/test/data/add.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/add.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/avg_pool.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/avg_pool.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/avg_pool_autopad.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/avg_pool_autopad.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/avg_pool_ceil.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/avg_pool_ceil.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/avg_pool_count_include_pad.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/avg_pool_count_include_pad.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/avg_pool_pad.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/avg_pool_pad.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/cast.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/cast.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/conv2d_with_bias.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/conv2d_with_bias.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/identity.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/identity.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/max_pool.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/max_pool.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/max_pool_autopad.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/max_pool_autopad.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/max_pool_ceil.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/max_pool_ceil.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/max_pool_dilations.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/max_pool_dilations.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/max_pool_order.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/max_pool_order.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/mul.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/mul.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/reciprocal.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/reciprocal.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/softmax.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/softmax.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/softmax_default_axis.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/softmax_default_axis.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/softmax_negative_axis.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/softmax_negative_axis.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/sqrt.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/sqrt.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/sub.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/sub.onnx
-------------------------------------------------------------------------------- /triton/src/test/data/tanh.onnx: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/flexflow/flexflow-serve/dd7bee0e9d6e43b1aaf9355a4d2091fdae2b6942/triton/src/test/data/tanh.onnx
--------------------------------------------------------------------------------