├── benchmarks ├── rnn │ ├── fractaltensor │ │ ├── README.md │ │ ├── stacked_lstm │ │ │ ├── images │ │ │ │ ├── lstm.png │ │ │ │ ├── access_to_generate_ysss.png │ │ │ │ ├── access_to_generate_ysss.pptx │ │ │ │ └── preprocess.py │ │ │ ├── regions │ │ │ │ └── regions.h │ │ │ ├── README.md │ │ │ └── CMakeLists.txt │ │ ├── figures │ │ │ ├── grid_lstm_fractaltensor.png │ │ │ └── grid_lstm_fractaltensor.pptx │ │ ├── cute_stacked_lstm │ │ │ ├── figures │ │ │ │ ├── etdg-lstm.png │ │ │ │ ├── access_map1.png │ │ │ │ ├── access_map2.png │ │ │ │ ├── access_map3.png │ │ │ │ └── access_map4.png │ │ │ ├── Makefile │ │ │ └── README.md │ │ ├── dilated_lstm │ │ │ ├── regions │ │ │ │ └── regions.h │ │ │ └── CMakeLists.txt │ │ ├── grid_lstm │ │ │ ├── Makefile │ │ │ ├── regions │ │ │ │ └── regions.h │ │ │ ├── run.sh │ │ │ └── CMakeLists.txt │ │ └── cute_dilated_lstm │ │ │ └── Makefile │ ├── baselines │ │ ├── figures │ │ │ ├── figures.pptx │ │ │ └── dilaited_lstm_pytorch.png │ │ ├── stacked_lstm │ │ │ ├── figures │ │ │ │ ├── stacked_lstm_perf_with_depth.pdf │ │ │ │ ├── for_plot.tsv │ │ │ │ ├── stacked_lstm_results.tsv │ │ │ │ └── perf_with_increased_depth_subplot1.tsv │ │ │ ├── tf_model │ │ │ │ └── __init__.py │ │ │ ├── pt_model │ │ │ │ └── __init__.py │ │ │ ├── triton_model │ │ │ │ └── __init__.py │ │ │ ├── README.md │ │ │ └── test_utils.py │ │ ├── grid_lstm │ │ │ ├── README.md │ │ │ ├── pt_model │ │ │ │ └── __init__.py │ │ │ ├── triton_model │ │ │ │ └── __init__.py │ │ │ ├── tf_model │ │ │ │ └── __init__.py │ │ │ ├── run_grid_lstm_pt.sh │ │ │ └── test_utils.py │ │ ├── stacked_dilated_rnn │ │ │ ├── tf_model │ │ │ │ ├── __init__.py │ │ │ │ └── model.py │ │ │ ├── triton_model │ │ │ │ └── __init__.py │ │ │ ├── pt_model │ │ │ │ └── __init__.py │ │ │ ├── README.md │ │ │ └── stacked_drnn_tensorflow.py │ │ └── README.md │ ├── cuDNN │ │ ├── Makefile │ │ ├── lstm_cell_cudnn.cu │ │ ├── stacked_lstm_cudnn.cu │ │ ├── utils.h │ │ ├── CMakeLists.txt │ │ └── main.cu │ └── tvm │ │ ├── utils.py │ │ └── 
CMakeLists.txt └── fused_two_hgemms │ ├── baseline │ └── README.md │ ├── fractaltensor │ ├── figures │ │ ├── access_maps.png │ │ ├── fused_two_gemms.png │ │ ├── etdg_for_two_gemms.png │ │ └── gemm_translated_to_macro_kernel.png │ └── Makefile │ └── README.md ├── docs ├── images │ ├── zip.png │ ├── stack.png │ ├── data_types.png │ ├── frontend_tree.png │ ├── primitive_type.png │ ├── access_by_depth_1.png │ ├── access_by_depth_2.png │ ├── fractaltensor_layout.png │ ├── grid_rnn_example │ │ ├── cell.png │ │ ├── scan_x.png │ │ ├── scan_y.png │ │ ├── grid_cell.png │ │ ├── grid_rnn.png │ │ └── code_structure_and_memory.png │ ├── index_a_FractalTensor_1.png │ ├── index_a_FractalTensor_2.png │ ├── type_expression_tensor.png │ ├── product_two_FractalTensor.png │ ├── slide_over_fractaltensor.png │ └── type_expression_FractalTensor.png └── fractaltensor_operations │ ├── information_query.md │ ├── access_primitives.md │ ├── memory_layout_of_fractaltensor.md │ ├── memory_operations.md │ └── extended_access_operations.md ├── assets ├── FractalTensor-logo.png └── FractalTensor_overview.png ├── requirements.txt ├── examples ├── dilated_rnn │ ├── example.gv.pdf │ ├── context.py │ └── dilated_rnn.py ├── hello_world │ ├── example.gv.pdf │ ├── context.py │ ├── utils.py │ └── hello_world.py ├── stacked_rnn │ ├── example.gv.pdf │ └── context.py ├── sparse_attention │ ├── figures │ │ └── bigbird-attn.png │ ├── context.py │ ├── torch_windowed_attention_demo.py │ ├── README.md │ └── bigbird.py ├── __init__.py ├── utils │ └── __init__.py ├── convolution │ ├── context.py │ └── utils.py ├── grid_rnn │ ├── context.py │ └── grid_rnn_utils.py ├── transformer │ └── context.py ├── flash_attention │ ├── context.py │ ├── flash_attention_utils.py │ └── README.md ├── rnn_attention │ ├── context.py │ └── rnn_attention_utils.py └── README.md ├── kaleido ├── parser │ ├── tests │ │ ├── figures │ │ │ ├── udf1.gv.pdf │ │ │ ├── udf2.gv.pdf │ │ │ ├── udf3.gv.pdf │ │ │ ├── assignment1.gv.pdf │ │ │ ├── 
assignment2.gv.pdf │ │ │ ├── assignment3.gv.pdf │ │ │ ├── assignment4.gv.pdf │ │ │ ├── assignment5.gv.pdf │ │ │ └── assignment6.gv.pdf │ │ ├── context.py │ │ └── utils.py │ ├── errors.py │ └── operations │ │ └── access_patterns.py ├── core │ ├── device │ │ ├── device_context.h │ │ ├── tests │ │ │ └── CMakeLists.txt │ │ ├── cuda_info.h │ │ ├── kernels │ │ │ ├── softmax_common.h │ │ │ ├── tile_transmitter.h │ │ │ ├── lstm │ │ │ │ ├── dilated_lstm │ │ │ │ │ └── region1.h │ │ │ │ └── stacked_lstm │ │ │ │ │ └── region1.h │ │ │ ├── gather_scatter.h │ │ │ └── softmax_v2.h │ │ ├── traits_base.h │ │ ├── cuda_timer.h │ │ ├── gpu_context.cc │ │ ├── gpu_context.h │ │ └── cuda_info.cc │ ├── tests │ │ ├── test_main.cc │ │ ├── CMakeLists.txt │ │ ├── test_cuda_info.cc │ │ ├── test_layout.cc │ │ └── test_allocator.cc │ ├── fractal_tensor.cc │ ├── operators │ │ ├── expect_eq_op.h │ │ ├── softmax_op.h │ │ ├── transpose_op.h │ │ ├── print_op.h │ │ ├── online_softmax_op.h │ │ ├── gather_nd_op.h │ │ ├── scatter_nd_op.h │ │ ├── concat_op.h │ │ ├── matmul_op.h │ │ ├── fill_op.h │ │ ├── gemm_batched_op.h │ │ ├── CMakeLists.txt │ │ ├── elementwise_op.h │ │ ├── softmax_op.cu │ │ ├── online_softmax_op.cu │ │ ├── launch_config.h │ │ ├── tests │ │ │ └── b2b_gemm_test_utils.h │ │ ├── fill_op.cu │ │ ├── gather_nd_op.cu │ │ └── scatter_nd_op.cu │ ├── allocator.h │ ├── tensor.cc │ ├── data_types.proto │ ├── init.cc │ ├── tensor_shape.cc │ ├── place.cc │ ├── config.h │ ├── tile_shape.h │ ├── fractal_tensor.h │ ├── layout.h │ ├── place.h │ ├── tensor_shape.h │ └── CMakeLists.txt ├── frontend │ ├── __init__.py │ ├── operations │ │ ├── tests │ │ │ ├── context.py │ │ │ ├── test_product.py │ │ │ ├── test_zip.py │ │ │ ├── test_constants.py │ │ │ ├── test_flatten.py │ │ │ ├── test_aggregate.py │ │ │ └── test_join.py │ │ ├── __init__.py │ │ └── tensor │ │ │ ├── arithmetic │ │ │ ├── contraction.py │ │ │ └── broadcast.py │ │ │ ├── data_movements.py │ │ │ └── reshape.py │ └── tests │ │ ├── context.py │ │ └── 
test_type_equivalence.py └── __init__.py ├── CODE_OF_CONDUCT.md ├── .gitignore ├── .clang-format ├── scripts ├── clang_format.hook └── format.sh ├── Makefile ├── cmake ├── python.cmake ├── external │ ├── pybind.cmake │ ├── tvm.cmake │ ├── cccl.cmake │ ├── cutlass.cmake │ ├── zlib.cmake │ ├── gflags.cmake │ └── glog.cmake └── third_party.cmake ├── LICENSE ├── SUPPORT.md ├── .pre-commit-config.yaml ├── CMakeLists.txt └── SECURITY.md /benchmarks/rnn/fractaltensor/README.md: -------------------------------------------------------------------------------- 1 | [TBD] 2 | -------------------------------------------------------------------------------- /benchmarks/fused_two_hgemms/baseline/README.md: -------------------------------------------------------------------------------- 1 | [TBD] 2 | -------------------------------------------------------------------------------- /docs/images/zip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/zip.png -------------------------------------------------------------------------------- /docs/images/stack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/stack.png -------------------------------------------------------------------------------- /docs/images/data_types.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/data_types.png -------------------------------------------------------------------------------- /assets/FractalTensor-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/assets/FractalTensor-logo.png -------------------------------------------------------------------------------- 
/docs/images/frontend_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/frontend_tree.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anytree==2.8.0 2 | astpretty==2.1.0 3 | asttokens==2.0.5 4 | absl-py==1.0.0 5 | graphviz==0.17 6 | -------------------------------------------------------------------------------- /docs/images/primitive_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/primitive_type.png -------------------------------------------------------------------------------- /assets/FractalTensor_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/assets/FractalTensor_overview.png -------------------------------------------------------------------------------- /docs/images/access_by_depth_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/access_by_depth_1.png -------------------------------------------------------------------------------- /docs/images/access_by_depth_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/access_by_depth_2.png -------------------------------------------------------------------------------- /docs/images/fractaltensor_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/fractaltensor_layout.png 
-------------------------------------------------------------------------------- /docs/images/grid_rnn_example/cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/grid_rnn_example/cell.png -------------------------------------------------------------------------------- /examples/dilated_rnn/example.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/examples/dilated_rnn/example.gv.pdf -------------------------------------------------------------------------------- /examples/hello_world/example.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/examples/hello_world/example.gv.pdf -------------------------------------------------------------------------------- /examples/stacked_rnn/example.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/examples/stacked_rnn/example.gv.pdf -------------------------------------------------------------------------------- /docs/images/grid_rnn_example/scan_x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/grid_rnn_example/scan_x.png -------------------------------------------------------------------------------- /docs/images/grid_rnn_example/scan_y.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/grid_rnn_example/scan_y.png -------------------------------------------------------------------------------- /docs/images/index_a_FractalTensor_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/index_a_FractalTensor_1.png -------------------------------------------------------------------------------- /docs/images/index_a_FractalTensor_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/index_a_FractalTensor_2.png -------------------------------------------------------------------------------- /docs/images/type_expression_tensor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/type_expression_tensor.png -------------------------------------------------------------------------------- /docs/images/grid_rnn_example/grid_cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/grid_rnn_example/grid_cell.png -------------------------------------------------------------------------------- /docs/images/grid_rnn_example/grid_rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/grid_rnn_example/grid_rnn.png -------------------------------------------------------------------------------- /docs/images/product_two_FractalTensor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/product_two_FractalTensor.png -------------------------------------------------------------------------------- /docs/images/slide_over_fractaltensor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/slide_over_fractaltensor.png -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/udf1.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/udf1.gv.pdf -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/udf2.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/udf2.gv.pdf -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/udf3.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/udf3.gv.pdf -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/figures/figures.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/baselines/figures/figures.pptx -------------------------------------------------------------------------------- /docs/images/type_expression_FractalTensor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/type_expression_FractalTensor.png -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/assignment1.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/assignment1.gv.pdf 
-------------------------------------------------------------------------------- /kaleido/parser/tests/figures/assignment2.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/assignment2.gv.pdf -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/assignment3.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/assignment3.gv.pdf -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/assignment4.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/assignment4.gv.pdf -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/assignment5.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/assignment5.gv.pdf -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/assignment6.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/assignment6.gv.pdf -------------------------------------------------------------------------------- /examples/sparse_attention/figures/bigbird-attn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/examples/sparse_attention/figures/bigbird-attn.png 
-------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/images/lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/stacked_lstm/images/lstm.png -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/figures/dilaited_lstm_pytorch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/baselines/figures/dilaited_lstm_pytorch.png -------------------------------------------------------------------------------- /docs/images/grid_rnn_example/code_structure_and_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/grid_rnn_example/code_structure_and_memory.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/figures/grid_lstm_fractaltensor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/figures/grid_lstm_fractaltensor.png -------------------------------------------------------------------------------- /benchmarks/fused_two_hgemms/fractaltensor/figures/access_maps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/fused_two_hgemms/fractaltensor/figures/access_maps.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/figures/grid_lstm_fractaltensor.pptx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/figures/grid_lstm_fractaltensor.pptx -------------------------------------------------------------------------------- /benchmarks/fused_two_hgemms/fractaltensor/figures/fused_two_gemms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/fused_two_hgemms/fractaltensor/figures/fused_two_gemms.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/etdg-lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/etdg-lstm.png -------------------------------------------------------------------------------- /benchmarks/fused_two_hgemms/fractaltensor/figures/etdg_for_two_gemms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/fused_two_hgemms/fractaltensor/figures/etdg_for_two_gemms.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map1.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map2.png 
-------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map3.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map4.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/images/access_to_generate_ysss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/stacked_lstm/images/access_to_generate_ysss.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/images/access_to_generate_ysss.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/stacked_lstm/images/access_to_generate_ysss.pptx -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/figures/stacked_lstm_perf_with_depth.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/baselines/stacked_lstm/figures/stacked_lstm_perf_with_depth.pdf -------------------------------------------------------------------------------- 
/benchmarks/fused_two_hgemms/fractaltensor/figures/gemm_translated_to_macro_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/fused_two_hgemms/fractaltensor/figures/gemm_translated_to_macro_kernel.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/dilated_lstm/regions/regions.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #pragma once 5 | 6 | #include "region1.h" 7 | #include "region2.h" 8 | -------------------------------------------------------------------------------- /kaleido/core/device/device_context.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace kaleido { 4 | namespace core { 5 | 6 | class DeviceContext { 7 | public: 8 | virtual ~DeviceContext() {} 9 | }; 10 | } // namespace core 11 | } // namespace kaleido 12 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/regions/regions.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #pragma once 5 | 6 | #include "region1.h" 7 | #include "region2.h" 8 | #include "region3.h" 9 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/grid_lstm/Makefile: -------------------------------------------------------------------------------- 1 | BENCH_NAME ?= grid_rnn 2 | BUILD_DIR := build 3 | 4 | .PHONY: build clean 5 | 6 | build: 7 | @mkdir -p build && cd build && cmake .. && make -j12 8 | 9 | $(BUILD_DIR)/$(BENCH_NAME): build 10 | 11 | clean: 12 | @rm -rf build 13 | -------------------------------------------------------------------------------- /benchmarks/fused_two_hgemms/fractaltensor/Makefile: -------------------------------------------------------------------------------- 1 | BENCH_NAME ?= back2back_hgemm 2 | BUILD_DIR := build 3 | 4 | .PHONY: build clean 5 | 6 | build: 7 | @mkdir -p build && cd build && cmake .. && make -j12 8 | 9 | $(BUILD_DIR)/$(BENCH_NAME): build 10 | 11 | clean: 12 | @rm -rf build 13 | -------------------------------------------------------------------------------- /examples/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from .data_utils import * 7 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/figures/for_plot.tsv: -------------------------------------------------------------------------------- 1 | Test Name Average Time Elapsed Time Throughput 2 | CuDNN 0.0251 0.7545 2544.7697 3 | PT 0.2123 6.3702 301.4025 4 | PT_JITed 0.0505 1.5161 1266.4127 5 | TF_GraphMode 0.0743 2.2293 861.2472 6 | TF_WhileOpLSTM 0.1068 3.2042 599.2203 7 | TF_AutoGraph 0.0501 1.5038 1276.7482 8 | -------------------------------------------------------------------------------- /kaleido/core/tests/test_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include 5 | #include 6 | 7 | int main(int argc, char** argv) { 8 | testing::InitGoogleTest(&argc, argv); 9 | google::InitGoogleLogging(argv[0]); 10 | 11 | return RUN_ALL_TESTS(); 12 | } 13 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/grid_lstm/regions/regions.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #pragma once 5 | 6 | #include "region1.h" 7 | #include "region2.h" 8 | #include "region3.h" 9 | #include "region4.h" 10 | #include "region5.h" 11 | #include "region6.h" 12 | #include "region7.h" 13 | #include "region8.h" 14 | -------------------------------------------------------------------------------- /benchmarks/rnn/cuDNN/Makefile: -------------------------------------------------------------------------------- 1 | BENCH_NAME ?= lstm_cell_cudnn 2 | BUILD_DIR := build 3 | OUTPUT_FILE ?= ../c_cudnn_lstm_cell_bench.tsv 4 | 5 | .PHONY: build bench clean 6 | 7 | build: 8 | @mkdir -p build && cd build && cmake .. && make -j 9 | 10 | $(BUILD_DIR)/$(BENCH_NAME): build 11 | 12 | bench: $(BUILD_DIR)/$(BENCH_NAME) 13 | @./$(BUILD_DIR)/$(BENCH_NAME) $(OUTPUT_FILE) 14 | 15 | clean: 16 | @rm -rf build 17 | -------------------------------------------------------------------------------- /examples/convolution/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/dilated_rnn/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/grid_rnn/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/hello_world/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/stacked_rnn/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/transformer/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /kaleido/core/fractal_tensor.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/fractal_tensor.h" 5 | 6 | #include 7 | 8 | namespace kaleido { 9 | namespace core { 10 | 11 | std::string FractalTensor::DebugString() const { 12 | return type_desc_.DebugString(); 13 | } 14 | 15 | } // namespace core 16 | } // namespace kaleido 17 | -------------------------------------------------------------------------------- /examples/flash_attention/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/rnn_attention/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/sparse_attention/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /kaleido/core/device/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | set(TEST_ROOT ${PROJECT_SOURCE_DIR}/kaleido/core/device/tests) 7 | 8 | nv_test(test_tile_copy SRCS ${TEST_ROOT}/test_tile_copy.cu) 9 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_dilated_lstm/Makefile: -------------------------------------------------------------------------------- 1 | BENCH_NAME ?= dilated_lstm 2 | BUILD_DIR := build 3 | OUTPUT_FILE ?= ../../dilated_lstm_bench.tsv 4 | 5 | .PHONY: build bench clean 6 | 7 | build: 8 | @mkdir -p build && cd build && cmake .. && make -j 9 | 10 | $(BUILD_DIR)/$(BENCH_NAME): build 11 | 12 | bench: $(BUILD_DIR)/$(BENCH_NAME) 13 | @./$(BUILD_DIR)/$(BENCH_NAME) $(OUTPUT_FILE) 14 | 15 | clean: 16 | @rm -rf build 17 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/Makefile: -------------------------------------------------------------------------------- 1 | BENCH_NAME ?= stacked_lstm 2 | BUILD_DIR := build 3 | OUTPUT_FILE ?= ../../cute_stacked_lstm_bench.tsv 4 | 5 | .PHONY: build bench clean 6 | 7 | build: 8 | @mkdir -p build && cd build && cmake .. && make -j 9 | 10 | $(BUILD_DIR)/$(BENCH_NAME): build 11 | 12 | bench: $(BUILD_DIR)/$(BENCH_NAME) 13 | @./$(BUILD_DIR)/$(BENCH_NAME) $(OUTPUT_FILE) 14 | 15 | clean: 16 | @rm -rf build 17 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Restrictions of using Python's syntax in implementing the examples 2 | 3 | - The parser parses dataflow relations among variable generations and usages from 4 | the source code. Functions can be nested, but the current implementation of the parser ONLY 5 | resolves name alias in the function's local scope. Name resolver does not search 6 | the enclosing scope. 
A better implementation of name resolver will remove this limitation. 7 | -------------------------------------------------------------------------------- /kaleido/core/operators/expect_eq_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class ExpectEqOp { 11 | public: 12 | void operator()(const Tensor& x, const Tensor& y, float epsilon = 1e-5); 13 | }; 14 | 15 | } // namespace ops 16 | } // namespace core 17 | } // namespace kaleido 18 | -------------------------------------------------------------------------------- /kaleido/core/allocator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace kaleido { 6 | namespace core { 7 | 8 | // TODO(ying): make Place a template parameter. 9 | // template 10 | class Allocator { 11 | public: 12 | virtual ~Allocator() = default; 13 | 14 | virtual void* Allocate(const size_t& nbytes) = 0; 15 | virtual void Deallocate(void* ptr) = 0; 16 | }; 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/grid_lstm/README.md: -------------------------------------------------------------------------------- 1 | # gridLSTM 2 | 3 | ## Hyper-parameters 4 | 5 | 1. `batch_size`=20 6 | 2. `seq_len`=64 7 | 3. `hidden_size`=`input_size`=128 8 | 4. `rnn_cell`=`LSTM` 9 | 5. 
`iters` = 20, `warmup` = 10 10 | 11 | ## Result 12 | 13 | |Name|PyTorch Average Time| TF_graph Average Time| 14 | |:--|:--|:--| 15 | |gridlstm_gpu:0_forward| 2.6266 |2.5567| 16 | |gridlstm_cpu_forward| 8.6012 |3.7226| 17 | 18 | > tf_graph: using tf.compat.v1.session 19 | -------------------------------------------------------------------------------- /kaleido/core/operators/softmax_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class SoftmaxOp { 11 | public: 12 | void operator()(const DeviceContext& context, const Tensor& x, Tensor& y, 13 | int dim); 14 | }; 15 | 16 | } // namespace ops 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/core/operators/transpose_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class TransposeOp { 11 | public: 12 | void operator()(const Tensor& input, Tensor& output, 13 | std::vector dims); 14 | }; 15 | 16 | } // namespace ops 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/parser/tests/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..'))) 11 | 12 | import random 13 | import unittest 14 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/tf_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from . 
import rnn, rnn2 13 | -------------------------------------------------------------------------------- /kaleido/core/operators/print_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class PrintOp { 11 | public: 12 | std::string operator()(const Tensor& input, int precision = 3, int count = -1, 13 | int pos = -1) const; 14 | }; 15 | 16 | } // namespace ops 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/core/operators/online_softmax_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class OnlineNormalizedSoftmaxOp { 11 | public: 12 | void operator()(const DeviceContext& context, const Tensor& x, Tensor& y, 13 | int dim); 14 | }; 15 | 16 | } // namespace ops 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/core/operators/gather_nd_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class GatherNdOp { 11 | public: 12 | void operator()(const DeviceContext& context, Tensor& output, 13 | const Tensor& input, const Tensor& indices); 14 | }; 15 | 16 | } // namespace ops 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/core/tensor.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 
Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/tensor.h" 5 | 6 | #include 7 | 8 | namespace kaleido { 9 | namespace core { 10 | 11 | std::string Tensor::DebugString() const { 12 | std::stringstream ss; 13 | ss << "Tensor {" << std::endl << type_desc_.DebugString() << std::endl << "}"; 14 | return ss.str(); 15 | } 16 | 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/frontend/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | from kaleido.frontend.fractal_tensor import * 9 | from kaleido.frontend.tensor import * 10 | from kaleido.frontend.types import * 11 | -------------------------------------------------------------------------------- /kaleido/core/operators/scatter_nd_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class ScatterNdAddOp { 11 | public: 12 | void operator()(const DeviceContext& context, Tensor& data, 13 | const Tensor& updates, const Tensor& indices); 14 | }; 15 | 16 | } // namespace ops 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/core/device/cuda_info.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | namespace kaleido { 6 | namespace 
core { 7 | 8 | int GetGPUDeviceCount(); 9 | 10 | int GetGPUComputeCapability(int id); 11 | 12 | int GetGPUMultiProcessors(int id); 13 | 14 | int GetGPUMaxThreadsPerMultiProcessor(int id); 15 | 16 | int GetGPUMaxThreadsPerBlock(int id); 17 | 18 | dim3 GetGpuMaxGridDimSize(int); 19 | 20 | std::string GetDeviceName(); 21 | 22 | } // namespace core 23 | } // namespace kaleido 24 | -------------------------------------------------------------------------------- /kaleido/core/operators/concat_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class ConcatOp { 11 | public: 12 | void operator()(const DeviceContext& context, 13 | const std::vector& inputs, Tensor& output, 14 | size_t dim); 15 | }; 16 | 17 | } // namespace ops 18 | } // namespace core 19 | } // namespace kaleido 20 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../..'))) 11 | 12 | import random 13 | import unittest 14 | 15 | import kaleido 16 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/pt_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from .rnn import small_model 13 | 14 | __all__ = [ 15 | "small_model", 16 | ] 17 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/triton_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | from .rnn import StackedLSTM 10 | 11 | sys.path.insert( 12 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 13 | 14 | __all__ = [ 15 | "StackedLSTM", 16 | ] 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | *.log 3 | !requirements*.txt 4 | *.tgz 5 | *.gz 6 | *.pyc 7 | .ipynb_checkpoints/ 8 | __pycache__/ 9 | .vs/ 10 | .vscode/ 11 | .data/ 12 | venv/ 13 | .idea/ 14 | .checkpoints/ 15 | *.pb.h 16 | *.pb.cc 17 | *_pb2.py 18 | tensorboard/ 19 | benchmarks/attention/baseline/MultiHeadAttention/log 20 | 21 | # generated by compiling tex files. 22 | *.aux 23 | *.bbl 24 | *.blg 25 | *.idx 26 | *.ind 27 | *.lof 28 | *.lot 29 | *.out 30 | *.toc 31 | *.acn 32 | *.acr 33 | *.alg 34 | *.glg 35 | *.glo 36 | *.gls 37 | *.ist 38 | *.fls 39 | *.gv 40 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/grid_lstm/pt_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from .model import StackedGridModel 13 | 14 | __all__ = [ 15 | "StackedGridModel", 16 | ] 17 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_dilated_rnn/tf_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from .model import StackedDRNN 13 | 14 | __all__ = [ 15 | 'StackedDRNN', 16 | ] 17 | -------------------------------------------------------------------------------- /kaleido/core/operators/matmul_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class MatMulOp { 11 | public: 12 | void operator()(const DeviceContext& context, const Tensor& A, bool trans_a, 13 | const Tensor& B, bool trans_b, Tensor& C, T alf = 1., 14 | T bet = 0.); 15 | }; 16 | 17 | } // namespace ops 18 | } // namespace core 19 | } // namespace kaleido 20 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/grid_lstm/triton_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) 
Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from .model import StackedGridModel 13 | 14 | __all__ = [ 15 | "StackedGridModel", 16 | ] 17 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_dilated_rnn/triton_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | from .rnn import StackedDRNN 10 | 11 | sys.path.insert( 12 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 13 | 14 | __all__ = [ 15 | "StackedDRNN", 16 | ] 17 | -------------------------------------------------------------------------------- /kaleido/core/operators/fill_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class FillOp { 11 | public: 12 | void operator()(Tensor& input); 13 | void operator()(Tensor& input, float value); 14 | void operator()(Tensor& input, float mean, float stddev); 15 | void operator()(Tensor& input, const std::string& mode, float scale = 1.); 16 | }; 17 | 18 | } // namespace ops 19 | } // namespace core 20 | } // namespace kaleido 21 | -------------------------------------------------------------------------------- /kaleido/frontend/tests/context.py: 
-------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..'))) 11 | 12 | import random 13 | import unittest 14 | 15 | import kaleido 16 | from kaleido import FractalTensor, FractalTensorStorage, Tensor, TensorStorage 17 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_dilated_rnn/pt_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from .model import StackedDRNN, StackedDRNNJIT 13 | 14 | __all__ = [ 15 | 'StackedDRNNJIT', 16 | 'StackedDRNN', 17 | ] 18 | -------------------------------------------------------------------------------- /kaleido/core/data_types.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package kaleido.core; 4 | 5 | message VarType { 6 | enum DataType { 7 | BOOL = 0; 8 | INT32 = 1; 9 | INT64 = 2; 10 | FP32 = 3; 11 | FP64 = 4; 12 | } 13 | 14 | DataType type = 1; 15 | 16 | message TensorTypeDesc { 17 | DataType dtype = 1; 18 | repeated int64 dims = 2; 19 | string place = 3; 20 | } 21 | 22 | message FractalTensorTypeDesc { 23 | TensorTypeDesc dtype = 1; 24 | int64 depth = 2; 25 | repeated bool is_static = 3; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /kaleido/core/device/kernels/softmax_common.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #pragma once 5 | 6 | namespace kaleido { 7 | namespace core { 8 | namespace cuda_kernel { 9 | 10 | template 11 | struct MD { 12 | T m; 13 | T d; 14 | }; 15 | 16 | template <> 17 | struct __align__(8) MD { 18 | float m; 19 | float d; 20 | }; 21 | 22 | template <> 23 | struct __align__(16) MD { 24 | double m; 25 | double d; 26 | }; 27 | 28 | } // namespace cuda_kernel 29 | } // namespace core 30 | } // namespace kaleido 31 | -------------------------------------------------------------------------------- /kaleido/core/operators/gemm_batched_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | #include 6 | 7 | namespace kaleido { 8 | namespace core { 9 | namespace ops { 10 | 11 | template 12 | class GemmBatchedOp { 13 | public: 14 | void operator()(const DeviceContext& context, const std::vector& A, 15 | bool trans_a, const std::vector& B, bool trans_b, 16 | std::vector& C, T alf = 1., T bet = 0.); 17 | }; 18 | 19 | } // namespace ops 20 | } // namespace core 21 | } // namespace kaleido 22 | -------------------------------------------------------------------------------- /kaleido/core/init.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace py = pybind11; 10 | 11 | namespace kaleido { 12 | namespace core { 13 | 14 | std::once_flag glog_init_flag; 15 | 16 | void InitGLOG(const std::string& prog_name) { 17 | std::call_once(glog_init_flag, [&]() { 18 | google::InitGoogleLogging(strdup(prog_name.c_str())); 19 | }); 20 | } 21 | 22 | PYBIND11_MODULE(_core, m) { m.def("init_glog", InitGLOG); } 23 | 24 | } // namespace core 25 | } // namespace kaleido 26 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | # Run manually to reformat a file: 2 | # clang-format -i --style=file 3 | BasedOnStyle: Google 4 | ColumnLimit: 80 5 | IndentWidth: 2 6 | AccessModifierOffset: -2 7 | DerivePointerAlignment: false 8 | KeepEmptyLinesAtTheStartOfBlocks: false 9 | SortIncludes: true 10 | IncludeBlocks: Regroup 11 | IncludeCategories: 12 | - Regex: '<([A-Za-z0-9\Q/-_\E])+>' 13 | Priority: 4 14 | - Regex: '<(catch2|boost)\/' 15 | Priority: 3 16 | - Regex: '<([A-Za-z0-9.\Q/-_\E])+>' 17 | Priority: 2 18 | - Regex: '"([A-Za-z0-9.\Q/-_\E])+"' 19 | Priority: 1 20 | 21 | AllowShortLoopsOnASingleLine: true 22 | AllowShortIfStatementsOnASingleLine: true 23 | Cpp11BracedListStyle: true 24 | -------------------------------------------------------------------------------- /kaleido/core/operators/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | op_library(fill_op DEPS ${CUDA_curand_LIBRARY}) 7 | op_library(concat_op) 8 | op_library(transpose_op) 9 | op_library(print_op) 10 | op_library(matmul_op) 11 | op_library(softmax_op) 12 | op_library(gemm_batched_op) 13 | op_library(elementwise_op) 14 | op_library(gather_nd_op) 15 | op_library(scatter_nd_op) 16 | op_library(online_softmax_op) 17 | op_library(expect_eq_op) 18 | add_subdirectory(tests) 19 | -------------------------------------------------------------------------------- /scripts/clang_format.hook: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #------------------------------------------------------------------------- 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT License. 5 | # -------------------------------------------------------------------------- 6 | set -e 7 | 8 | readonly VERSION="18.1.5" 9 | 10 | version=$(clang-format -version) 11 | 12 | if ! [[ $version == *"$VERSION"* ]]; then 13 | echo "clang-format version check failed." 14 | echo "a version contains '$VERSION' is needed, but get '$version'" 15 | echo "you can install the right version, and make an soft-link to '\$PATH' env" 16 | exit -1 17 | fi 18 | 19 | clang-format $@ 20 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/grid_lstm/tf_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from .model import (BaseWhileOpGridLSTMNet, FineGrainedOpGridLSTMNet, 13 | WhileOpGridLSTMNet) 14 | 15 | __all__ = [ 16 | "WhileOpGridLSTMNet", 17 | "BaseWhileOpGridLSTMNet", 18 | "FineGrainedOpGridLSTMNet", 19 | ] 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | CUDNN_HOME ?= 7 | BUILD_DIR := build 8 | 9 | .PHONY: build clean 10 | 11 | build: 12 | @mkdir -p $(BUILD_DIR)/ 13 | @cd build && cmake ../ -D PYTHON_EXECUTABLE:FILEPATH=`which python3` \ 14 | -D CUDNN_INCLUDE_DIR=$(CUDNN_HOME)/include \ 15 | -D CUDNN_LIBRARY=$(CUDNN_HOME)/lib/libcudnn.so && make -j$(nproc) 16 | 17 | $(BUILD_DIR)/kaleido: 18 | @$(MAKE) build 19 | 20 | clean: 21 | @rm -f unittest.log 22 | @rm -rf $(BUILD_DIR) 23 | -------------------------------------------------------------------------------- /kaleido/core/operators/elementwise_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | enum ElementwiseType { 10 | kUnary = 1, 11 | kBinary = 2, 12 | kTernary = 3, 13 | kArityFour = 4, 14 | kAny = -1 // 15 | }; 16 | 17 | template 19 | class ElementwiseOp { 20 | public: 21 | void operator()(const DeviceContext& context, 22 | const std::vector& inputs, Tensor& output, 23 | Functor func); 24 | }; 25 | 26 | } // namespace ops 
27 | } // namespace core 28 | } // namespace kaleido 29 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/README.md: -------------------------------------------------------------------------------- 1 | # Hyper-parameters 2 | 3 | 1. `num_layers` = 8, 8 LSTM layers are stacked 4 | 1. LSTM's `hidden_dim` = `output_dim` = 512 5 | 1. All training samples have a fixed length: `seq_len_` = 100 6 | 1. `batch_size` = 64 7 | 1. `warm_up` = 10, `iteration` = 30 8 | 9 | Explanation for some implementations: 10 | 11 | |Name|Explanation| 12 | |:--|:--| 13 | |Fine-grained Lstm Cell V1|Compute LSTM's Four gates separatedly.| 14 | |Fine-grained Lstm Cell V2|Manually batch GEMMs in LSTM's four gates into a large GEMM.| 15 | |Static LSTM cell in TensorFlow|LSTM cell as a single operator.| 16 | 17 |

18 | 19 |

20 | -------------------------------------------------------------------------------- /benchmarks/fused_two_hgemms/README.md: -------------------------------------------------------------------------------- 1 |

2 |
3 | Fig. Compose back-to-back GEMMs using parallel operator nesting. 4 |

5 | 6 |

7 |
8 | Fig. Extended task dependence graph representations for back-to-back GEMMs. 9 |

10 | 11 |

12 |
13 | Fig. AccessMap annotation attached to the extended task dependence graph. 14 |

15 | 16 |

17 |
18 | Fig. Translate into heirarchical dataflow on the CUDA device. 19 |

20 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/grid_lstm/run.sh: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #!/bin/bash 5 | 6 | # overall for figure 7. 7 | ./build/grid_rnn 32 256 10 1 8 | ./build/grid_rnn 32 512 10 0 9 | ./build/grid_rnn 32 1024 10 0 10 | 11 | depths='1 2 4 8 16 32' 12 | hiddens='256 1024' # for middle size and large size 13 | # scale with depth in Figure 9 14 | for h in $hiddens; do 15 | for d in $depths; do 16 | ./build/grid_rnn $d $h 10 0 17 | done 18 | done 19 | 20 | hiddens='256 1024' # for middle size and large size 21 | # scale with length in Figure 9 22 | lengths='5, 7, 10' 23 | for h in $hiddens; do 24 | for l in $lengths; do 25 | ./build/grid_rnn 32 $h $l 0 26 | done 27 | done 28 | -------------------------------------------------------------------------------- /kaleido/core/tensor_shape.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "kaleido/core/tensor_shape.h" 5 | 6 | namespace kaleido { 7 | namespace core { 8 | 9 | bool TensorShape::IsEuqalShape(const TensorShape& b) const { 10 | if (b.ndim() != ndim()) return false; 11 | for (size_t i = 0; i < ndim(); ++i) { 12 | if (b.dim_size(i) != dim_size(i)) return false; 13 | } 14 | return true; 15 | } 16 | 17 | std::string TensorShape::DebugString() const { 18 | std::stringstream ss; 19 | ss << "shape : ["; 20 | for (size_t i = 0; i < dim_ - 1; ++i) ss << dim_sizes_[i] << ", "; 21 | ss << dim_sizes_[dim_ - 1] << "]"; 22 | return ss.str(); 23 | } 24 | 25 | } // namespace core 26 | } // namespace kaleido 27 | -------------------------------------------------------------------------------- /kaleido/core/place.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/place.h" 5 | 6 | #include 7 | 8 | namespace kaleido { 9 | namespace core { 10 | 11 | class PlacePrinter : public boost::static_visitor<> { 12 | public: 13 | explicit PlacePrinter(std::ostream& os) : os_(os) {} 14 | 15 | void operator()(const CPUPlace&) { os_ << "CPU"; } 16 | void operator()(const CUDAPlace& p) { os_ << "CUDA:" << p.device; } 17 | 18 | private: 19 | std::ostream& os_; 20 | }; 21 | 22 | std::ostream& operator<<(std::ostream& out, const Place& place) { 23 | PlacePrinter printer(out); 24 | boost::apply_visitor(printer, place); 25 | return out; 26 | } 27 | 28 | } // namespace core 29 | } // namespace kaleido 30 | -------------------------------------------------------------------------------- /kaleido/core/config.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if defined(__CUDA_ARCH__) 4 | #define HOST_DEVICE __forceinline__ __host__ __device__ 5 | #define DEVICE __forceinline__ __device__ 6 | #define HOST __forceinline__ __host__ 7 | #else 8 
| #define HOST_DEVICE inline 9 | #define DEVICE inline 10 | #define HOST inline 11 | #endif 12 | 13 | #if defined(__CUDACC_RTC__) 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #define STL_NAMESPACE cuda::std 21 | 22 | #else 23 | #include // ptrdiff_t 24 | #include // uintptr_t 25 | #include // numeric_limits 26 | #include 27 | #include // tuple_size, tuple_element 28 | 29 | #define STL_NAMESPACE std 30 | #endif 31 | -------------------------------------------------------------------------------- /scripts/format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #------------------------------------------------------------------------- 4 | # Copyright (c) Microsoft Corporation. All rights reserved. 5 | # Licensed under the MIT License. 6 | # -------------------------------------------------------------------------- 7 | # Format Python files using yapf 8 | echo "Running yapf..." 9 | find . -type f -name "*.py" \ 10 | ! -path "./build/*" \ 11 | ! -path "./.git/*" \ 12 | ! -path "*.egg-info/*" \ 13 | -print0 | xargs -0 yapf --in-place 14 | 15 | # Format Python imports using isort 16 | echo "Running isort..." 17 | isort . 
18 | 19 | 20 | 21 | find kaleido/ -name "*.cc" -o -name "*.cpp" -o -name "*.h" -o -name "*.cu" | xargs clang-format -i 22 | find benchmarks/ -name "*.cpp" -o -name "*.h" -o -name "*.cu" | xargs clang-format -i 23 | -------------------------------------------------------------------------------- /docs/fractaltensor_operations/information_query.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [Information query](#information-query) 4 | - [length](#length) 5 | - [depth](#depth) 6 | 7 | 12 | 13 | 14 | 15 | ## Information query 16 | 17 | ### length 18 | 19 | $$\mathbf{length} ::\Psi n.[\alpha]^d_n \rightarrow \text{int}$$ 20 | 21 | ```python 22 | length(x: FractalTensor[T]) -> List[int] 23 | ``` 24 | 25 | `length` is only available after data is feed to a `FractalTensor` variable, otherwise, return an empty list. 26 | 27 | ### depth 28 | 29 | $$\mathbf{depth} ::\Psi n.[\alpha]^d_n \rightarrow \text{int}$$ 30 | 31 | ```python 32 | depth(x: FractalTensor[T]) -> int 33 | ``` 34 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/README.md: -------------------------------------------------------------------------------- 1 |

2 |
3 | Fig. The ETDG representation for stacked LSTM. 4 |

5 | 6 |

7 |
8 | Fig. The original edge annotation for S0 and S1 before program transformation. 9 |

10 | 11 |

12 |
13 | Fig. The original edge annotation for S2 and S3 before program transformation. 14 |

15 | 16 |

17 |
18 | Fig. The edge annotation for S0 and S1 after program transformation. 19 |

20 | 21 |

22 |
23 | Fig. The egde annotation for S2 and S3 after program transformation. 24 | -------------------------------------------------------------------------------- /kaleido/core/device/traits_base.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace kaleido { 5 | namespace core { 6 | 7 | // FIXME(ying). The swizzle function requires a data tile with a minimal 8 | // shape of <8, 32> for the <2, 3, 3> case, and a minimal shape of <8, 64> for 9 | // the <3, 3, 3> case. Here requires some check to ensure that the data tile 10 | // meets these requirements before using this function. 11 | template 12 | static constexpr int kSwizzle = (N == 32 ? 2 : 3); 13 | 14 | template 15 | struct TraitsBase { 16 | static constexpr int kAccessInBits = 128; // 128 bits 17 | static constexpr int kElmentBits = cutlass::sizeof_bits::value; 18 | static constexpr int kNumPerAccess = kAccessInBits / kElmentBits; 19 | }; 20 | 21 | } // namespace core 22 | } // namespace kaleido 23 | -------------------------------------------------------------------------------- /kaleido/core/tile_shape.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/config.h" 4 | 5 | #include 6 | #include 7 | 8 | namespace kaleido { 9 | namespace core { 10 | 11 | template 12 | using TileShape = cute::tuple...>; 13 | 14 | // FIXME(ying): Be careful that names like `rank` is quite common. 15 | // It is easy to conflict with cute's builtin function. 
16 | template 17 | __device__ static constexpr size_t rank = cute::rank_v; 18 | 19 | template 20 | __device__ static constexpr size_t dim_size = cute::get(TileShape{}); 21 | 22 | template 23 | __device__ static constexpr int64_t get_numel = cute::size(TileShape{}); 24 | 25 | } // namespace core 26 | } // namespace kaleido 27 | -------------------------------------------------------------------------------- /cmake/python.cmake: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | # FIXME(Ying): This may lead to runtime error if users have multiple locally 7 | # installed Pythons. To avoid runtime error, it is better to explicitly specify 8 | # which python is in use through: cmake -DPYTHON_EXECUTABLE:FILEPATH=`which 9 | # python3` 10 | 11 | find_package(PythonLibs REQUIRED) 12 | 13 | message(STATUS "Python include dir: ${PYTHON_INCLUDE_DIR}") 14 | message(STATUS "Python library: ${PYTHON_LIBRARY}") 15 | 16 | add_library(python SHARED IMPORTED GLOBAL) 17 | set_property(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) 18 | include_directories(${PYTHON_INCLUDE_DIRS}) 19 | -------------------------------------------------------------------------------- /kaleido/core/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | set(TEST_ROOT ${PROJECT_SOURCE_DIR}/kaleido/core/tests) 7 | 8 | cc_test_build(test_allocator SRCS ${TEST_ROOT}/test_allocator.cc DEPS 9 | fractaltensor_core) 10 | cc_test_build( 11 | test_tensor_and_fractaltensor 12 | SRCS 13 | ${TEST_ROOT}/test_tensor_and_fractaltensor.cc 14 | DEPS 15 | fractaltensor_core 16 | print_op 17 | fill_op) 18 | 19 | cc_test_build(test_cuda_info SRCS ${TEST_ROOT}/test_cuda_info.cc DEPS 20 | fractaltensor_core) 21 | 22 | cc_test_build(test_layout SRCS ${TEST_ROOT}/test_layout.cc DEPS 23 | fractaltensor_core) 24 | -------------------------------------------------------------------------------- /benchmarks/rnn/tvm/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import logging 7 | 8 | 9 | def get_logger(log_file_name='tvm_codegen.txt', log_level=logging.DEBUG): 10 | logger = logging.getLogger() 11 | logger.setLevel(log_level) 12 | formatter = logging.Formatter( 13 | '%(asctime)s - %(name)s - %(levelname)s: - %(message)s', 14 | datefmt='%Y-%m-%d %H:%M:%S') 15 | 16 | fh = logging.FileHandler(log_file_name) 17 | fh.setLevel(log_level) 18 | fh.setFormatter(formatter) 19 | 20 | ch = logging.StreamHandler() 21 | ch.setLevel(log_level) 22 | ch.setFormatter(formatter) 23 | 24 | logger.addHandler(ch) 25 | logger.addHandler(fh) 26 | return logger 27 | -------------------------------------------------------------------------------- /kaleido/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. 
All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | import sys 9 | 10 | from kaleido.frontend import operations 11 | from kaleido.frontend.fractal_tensor import * 12 | from kaleido.frontend.tensor import Parameter, Tensor 13 | from kaleido.frontend.types import * 14 | from kaleido.parser import * 15 | 16 | del absolute_import 17 | del division 18 | del print_function 19 | 20 | # FIXME(Ying): please manually create the soft link of the build dynamic library 21 | # in the build directory. 22 | # It needs a standarded way to distribute the package and import 23 | # bindings in future. 24 | # import _core 25 | 26 | # _core.init_glog(sys.argv[0]) 27 | -------------------------------------------------------------------------------- /kaleido/core/device/cuda_timer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/device/cuda_utils.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | class CudaTimer { 8 | private: 9 | cudaEvent_t start; 10 | cudaEvent_t stop; 11 | cudaStream_t stream; 12 | 13 | public: 14 | CudaTimer() { 15 | CudaCheck(cudaEventCreate(&start)); 16 | CudaCheck(cudaEventCreate(&stop)); 17 | } 18 | 19 | ~CudaTimer() { 20 | CudaCheck(cudaEventDestroy(start)); 21 | CudaCheck(cudaEventDestroy(stop)); 22 | } 23 | 24 | void Start(cudaStream_t st = 0) { 25 | stream = st; 26 | CudaCheck(cudaEventRecord(start, stream)); 27 | } 28 | 29 | float Stop() { 30 | float milliseconds = 0.; 31 | CudaCheck(cudaEventRecord(stop, stream)); 32 | CudaCheck(cudaEventSynchronize(stop)); 33 | CudaCheck(cudaEventElapsedTime(&milliseconds, start, stop)); 34 | return milliseconds; 35 | } 36 | }; 37 | } // namespace core 38 | } // namespace kaleido 39 | -------------------------------------------------------------------------------- 
/examples/convolution/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import random 7 | from typing import NamedTuple, Tuple 8 | 9 | import torch 10 | 11 | import kaleido 12 | from kaleido import FractalTensor, FractalTensorStorage, Tensor, TensorStorage 13 | from kaleido import operations as ops 14 | 15 | 16 | def gen_image_batch(tensor_shape: Tuple[int], 17 | batch_size: int, 18 | device='cpu') -> FractalTensor[FractalTensor[Tensor]]: 19 | """Returns a batch of image in format of NCHW.""" 20 | x = FractalTensor( 21 | TensorStorage(tensor_shape, kaleido.float32, device=device)) 22 | x.indices = list(range(batch_size)) 23 | x.initialize(torch.rand, *x.flatten_shape, device=device) 24 | return x 25 | -------------------------------------------------------------------------------- /benchmarks/rnn/tvm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | project(tvm_test CXX C) 3 | 4 | set(CMAKE_CXX_STANDARD 14) 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") 6 | set(CMAKE_CXX_FLAGS_DEBUG 7 | "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") 8 | set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall -Wno-sign-compare") 9 | 10 | find_package(CUDA QUIET REQUIRED) 11 | include_directories(${CUDA_INCLUDE_DIRS}) 12 | 13 | set(TVM_ROOT 14 | "${CMAKE_CURRENT_SOURCE_DIR}/../../../build/third_party/tvm/src/extern_tvm/" 15 | ) 16 | 17 | include_directories(${TVM_ROOT}/include ${TVM_ROOT}/3rdparty/dmlc-core/include 18 | ${TVM_ROOT}/3rdparty/dlpack/include) 19 | link_directories(${TVM_ROOT}) 20 | 21 | add_executable(main main.cc) 22 | 
target_link_libraries( 23 | main 24 | tvm_runtime 25 | cuda 26 | ${CUDA_CUDART_LIBRARY} 27 | ${CUDA_LIBRARIES} 28 | ${CUDA_CUBLAS_LIBRARIES} 29 | ${CUDA_curand_LIBRARY}) 30 | -------------------------------------------------------------------------------- /kaleido/parser/errors.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | __all__ = [ 7 | 'ParseError', 8 | 'UnsupportedConstruct', 9 | 'UnsupportedType', 10 | 'AnnotationError', 11 | 'UnknownPrimitiveOps', 12 | 'ShapeError', 13 | ] 14 | 15 | 16 | class ParseError(Exception): 17 | pass 18 | 19 | 20 | class UnsupportedConstruct(ParseError): 21 | """Exception for unsupported Python construct.""" 22 | 23 | def __init__(self, msg=None): 24 | self.msg = f"Unspport Python construct {msg}." 25 | 26 | 27 | class UnsupportedType(ParseError): 28 | pass 29 | 30 | 31 | class AnnotationError(ParseError): 32 | pass 33 | 34 | 35 | class UnknownPrimitiveOps(ParseError): 36 | pass 37 | 38 | 39 | class ShapeError(ParseError): 40 | pass 41 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_dilated_rnn/README.md: -------------------------------------------------------------------------------- 1 | # Stacked Dilated LSTM 2 | 3 | ## PyTorch implementation 4 | 5 |

6 | 7 |

8 | 9 | ## Hyper-parameters 10 | 11 | 1. `batch_size` = 32 12 | 2. `seq_len` = 100 13 | 3. `layers_num`= 6, the corresponding `dilation`=`[1, 2, 4, 8, 16, 32]` 14 | 4. `input_size`= 64, while size means number of dims 15 | 5. `hidden_size` = `output_size` = 64 16 | 6. `rnn_cell` = `LSTM` 17 | 18 | ## Results 19 | 20 | `counting_iteration_num` = 50, `warmup_iteration_num` = 20 21 | 22 | |Test Name|Average Time(s)|Elapsed Time(s)|Throughput(seq/s)| 23 | |:--|:--|:--|:--| 24 | |PyTroch Imperative|0.0085 |0.1698 |3768.5783| 25 | |PyTorch_JITed|0.0059 |0.1176 |5443.7788| 26 | |PyTorch Pad per Layer (cannot be JITed)|0.0092 |0.1843 |3472.5731| 27 | |TensorFlow Eager|0.0656 |3.2781 |488.0861| 28 | |TensorFlow Auto-graph|0.0073 |0.3648 |4386.2863| 29 | |TensorFlow Graph-mode|0.0051 |0.2575 |6214.1577| 30 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/README.md: -------------------------------------------------------------------------------- 1 | # Methods 2 | 3 | Below is a small test case to illustrate the idea: 4 | 5 | - batch_size = 2 6 | - length = 7 7 | - depth = 4 8 | - `ysss` is laid out in `[depth, length, batch_size]` 9 | 10 |

11 | 12 |

13 | 14 | In this small example, it is easy to observe that, within a hyperplane, parallel iterations that access `ysss`, `xs` and `hs` **follow a fixed stride**, thus `xs @ W` and `hs @ U` can be translated into `stridedBMM`. 15 | 16 | |No.|ysss|xs|hs| 17 | |:-:|:-:|:-:|:-:| 18 | |3-0|[16]|[2]|[14]| 19 | |3-1|[18,30]|[4,16]|[16,28]| 20 | |3-2|[20,32,44]|[6,18,30]|[18,30,42]| 21 | |3-3|[22,32,44]|[8,20,32]|[20,32,44]| 22 | |3-4|[24,36,48]|[10,22,34]|[22,34,46]| 23 | |3-5|[26,38,50]|[12,24,36]|[24,36,48]| 24 | |3-6|[40,52]|[26,38]|[38,50]| 25 | |3-7|[54]|[40]|[52]| 26 | 27 |

28 |
29 | Fig. The dataflow graph representation for the stacked LSTM network. 30 |

31 | -------------------------------------------------------------------------------- /kaleido/core/device/gpu_context.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/device/gpu_context.h" 5 | 6 | #include "kaleido/core/device/cuda_info.h" 7 | 8 | namespace kaleido { 9 | namespace core { 10 | 11 | GPUContext::GPUContext() { 12 | CublasCheck(cublasCreate(&cublas_handle_)); 13 | CublasCheck(cublasSetPointerMode(cublas_handle_, CUBLAS_POINTER_MODE_HOST)); 14 | // CudnnCheck(cudnnCreate(&cudnn_handle_)); 15 | 16 | compute_capability_ = GetGPUComputeCapability(0); 17 | multi_process_ = GetGPUMultiProcessors(0); 18 | max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(0); 19 | max_threads_per_block_ = GetGPUMaxThreadsPerBlock(0); 20 | max_grid_dim_size_ = GetGpuMaxGridDimSize(0); 21 | 22 | device_name_ = GetDeviceName(); 23 | } 24 | 25 | GPUContext::~GPUContext() { 26 | CublasCheck(cublasDestroy(cublas_handle_)); 27 | // CudnnCheck(cudnnDestroy(cudnn_handle_)); 28 | } 29 | 30 | } // namespace core 31 | } // namespace kaleido 32 | -------------------------------------------------------------------------------- /kaleido/core/tests/test_cuda_info.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "kaleido/core/device/cuda_info.h" 5 | 6 | #include 7 | 8 | #include 9 | 10 | namespace kaleido { 11 | namespace core { 12 | 13 | TEST(test, TEST_GET_CUDA_DEVICE_INFO) { 14 | std::cout << "cuda device count: " << GetGPUDeviceCount() << std::endl; 15 | std::cout << "Compute Capability: " << GetGPUComputeCapability(0) 16 | << std::endl; 17 | std::cout << "Multiprocessors: " << GetGPUMultiProcessors(0) << std::endl; 18 | std::cout << "Max threads per MP: " << GetGPUMaxThreadsPerMultiProcessor(0) 19 | << std::endl; 20 | std::cout << "Max threads per blocks: " << GetGPUMaxThreadsPerBlock(0) 21 | << std::endl; 22 | auto grid_size = GetGpuMaxGridDimSize(0); 23 | std::cout << "Max grid size (x, y, z): " << grid_size.x << ", " << grid_size.y 24 | << ", " << grid_size.z << std::endl; 25 | } 26 | 27 | } // namespace core 28 | } // namespace kaleido 29 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/grid_lstm/run_grid_lstm_pt.sh: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #!/bin/bash 5 | 6 | seq_len=10 7 | batch_size=32 8 | 9 | # overall 10 | hiddens='256 512 1024' 11 | for hidden in $hiddens; do 12 | python3 gridlstm_pt.py --seq_len=$seq_len \ 13 | --batch_size=$batch_size \ 14 | --hidden_size=$hidden \ 15 | --depth=32 16 | done 17 | 18 | # scale with depth 19 | depths='1 2 4 8 16 32' 20 | hiddens='256 1024' 21 | for hidden in $hiddens; do 22 | for depth in $depths; do 23 | python3 gridlstm_pt.py --seq_len=$seq_len \ 24 | --batch_size=$batch_size \ 25 | --hidden_size=$hidden \ 26 | --depth=$depth 27 | done 28 | done 29 | 30 | # scale with length 31 | lengths='5 7 10' 32 | hiddens='256 1024' 33 | for length in $lengths; do 34 | for hidden in $hiddens; do 35 | python3 gridlstm_pt.py --seq_len=$seq_len \ 36 | --batch_size=32 \ 37 | --hidden_size=$hidden \ 38 | --depth=32 39 | done 40 | done 41 | -------------------------------------------------------------------------------- /docs/fractaltensor_operations/access_primitives.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [Access primitives of FractalTensor](#access-primitives-of-FractalTensor) 4 | - [\*index](#index) 5 | - [\*slice](#slice) 6 | - [\*gather (permute elements)](#gather-permute-elements) 7 | 8 | 12 | 13 | 14 | 15 | # Access primitives of FractalTensor 16 | 17 | Primitive access operations have first-class implementations in the backend. 18 | 19 | ## \*index 20 | 21 | $$\mathbf{index} ::\Psi n.[\alpha]^d_n \rightarrow \Psi m.[\alpha]^{d-1}_m$$ 22 | 23 | ```python 24 | index(x: FractalTensor[T], i: int) -> T 25 | ``` 26 | 27 | Access a `FractalTensor` variable using the `[]` operator is equivalent to call `index` . 
28 | 29 | ## \*slice 30 | 31 | ```python 32 | slice(x: FractalTensor[T], start: int, end: int, stride: int) -> FractalTensor[T] 33 | ``` 34 | 35 | ## \*gather (permute elements) 36 | 37 | ```python 38 | gather(x: FractalTensor[T], indices:Tuple[int]) -> FractalTensor[T] 39 | ``` 40 | -------------------------------------------------------------------------------- /kaleido/core/device/kernels/tile_transmitter.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace kaleido { 21 | namespace core { 22 | namespace cuda_kernel { 23 | 24 | enum class TileLayout { 25 | RowMajor = 0, 26 | ColumnMajor = 1, 27 | SwizzledRowMajor = 2, // shared memory layout 28 | SwizzledColumnMajor = 3 29 | }; 30 | 31 | } // namespace cuda_kernel 32 | } // namespace core 33 | } // namespace kaleido 34 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | from kaleido.frontend.operations.conversion import * 9 | from kaleido.frontend.operations.fractaltensor.access import * 10 | from kaleido.frontend.operations.fractaltensor.functional.aggregate import * 11 | from kaleido.frontend.operations.fractaltensor.functional.apply_to_each import * 12 | from kaleido.frontend.operations.tensor.arithmetic.broadcast import * 13 | from kaleido.frontend.operations.tensor.arithmetic.contraction import * 14 | from kaleido.frontend.operations.tensor.arithmetic.elementwise import * 15 | from kaleido.frontend.operations.tensor.arithmetic.reduction import * 16 | from kaleido.frontend.operations.tensor.constants import * 17 | from kaleido.frontend.operations.tensor.data_movements import * 18 | from kaleido.frontend.operations.tensor.reshape import * 19 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/figures/stacked_lstm_results.tsv: -------------------------------------------------------------------------------- 1 | Test Name Average Time (s) Elapsed Time(s) Throughput(seq per sec) 2 | pt_cudnn_lstm 0.0251 0.7545 2544.7697 3 | pt_finegrained_op_v1_cuda:0 0.4980 14.9410 128.5051 4 | pt_finegrained_op_v2_cuda:0 0.2123 6.3702 301.4025 5 | pt_finegrained_op_v1_cuda:0_JIT 0.1387 4.1602 461.5115 6 | pt_finegrained_op_v2_cuda:0_JIT 0.0505 1.5161 1266.4127 7 | tf_graph_cudnnlstm 0.0374 1.1219 1711.4016 8 | tf_graph_fine_grained_op_lstm_v1_gpu 0.1267 3.8025 504.9329 9 | tf_graph_fine_grained_op_lstm_v2_gpu 0.0743 2.2293 861.2472 10 | tf_graph_static_lstm_cell_gpu 0.0823 2.4683 777.8715 11 | tf_graph_whileOpLstm_gpu 0.1068 3.2042 599.2203 12 | tf_eager_cudnnlstm 0.0751 2.2541 851.7819 13 | tf_eager_fine_grained_op_lstm_v1_gpu 4.5640 136.9190 14.0229 14 | tf_eager_fine_grained_op_lstm_v2_gpu 3.2110 96.3287 19.9318 15 
| tf_eager_static_lstm_cell_gpu 2.3811 71.4322 26.8786 16 | tf_autograph_cudnnlstm 0.0328 0.9851 1949.1312 17 | tf_autograph_fine_grained_op_lstm_v1_gpu 0.0857 2.5710 746.7896 18 | tf_autograph_fine_grained_op_lstm_v2_gpu 0.0501 1.5038 1276.7482 19 | tf_autograph_static_lstm_cell_gpu 0.0503 1.5080 1273.2241 20 | -------------------------------------------------------------------------------- /kaleido/core/tests/test_layout.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/cuda_allocator.h" 5 | #include "kaleido/core/layout.h" 6 | 7 | #include 8 | 9 | #include 10 | 11 | namespace kaleido { 12 | namespace core { 13 | 14 | TEST(test, TestLayout) { 15 | const int kRow = 3; 16 | const int kCol = 7; 17 | using L1 = RowMajor; 18 | L1 row_major; 19 | 20 | std::cout << "num_rows: " << num_rows << std::endl 21 | << "num_cols: " << num_cols; 22 | 23 | for (int row_id = 0; row_id < num_rows; ++row_id) { 24 | for (int col_id = 0; col_id < num_cols; ++col_id) { 25 | EXPECT_EQ(row_major(row_id, col_id), row_id * kCol + col_id); 26 | } 27 | } 28 | 29 | using L2 = ColMajor; 30 | L2 col_major; 31 | for (int row_id = 0; row_id < num_rows; ++row_id) { 32 | for (int col_id = 0; col_id < num_rows; ++col_id) { 33 | EXPECT_EQ(col_major(row_id, col_id), row_id + col_id * kRow); 34 | } 35 | } 36 | } 37 | 38 | } // namespace core 39 | } // namespace kaleido 40 | -------------------------------------------------------------------------------- /kaleido/core/device/kernels/lstm/dilated_lstm/region1.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #pragma once 5 | 6 | #include "kaleido/core/device/kernels/lstm.h" 7 | #include "kaleido/core/tensor.h" 8 | 9 | namespace kaleido::core::cuda_kernel { 10 | 11 | template 13 | float DilatedLstmRegion1(Element* csss, Element* hsss, const Element* xss, 14 | const Element* ws, const Element* us, 15 | const Element* init, int seq_length) { 16 | float elapsed_time = 0.0; 17 | 18 | const Element* x = xss; 19 | const Element* w = ws; 20 | const Element* u = us; 21 | 22 | cuda_kernel::CuteLSTMLayer 24 | cute_lstm_layer; 25 | 26 | elapsed_time += cute_lstm_layer(w, x, u, init, init, csss, hsss, seq_length); 27 | 28 | return elapsed_time; 29 | } 30 | } // namespace kaleido::core::cuda_kernel 31 | -------------------------------------------------------------------------------- /kaleido/core/fractal_tensor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "kaleido/core/allocator.h" 3 | #include "kaleido/core/types.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | 8 | class FractalTensor { 9 | public: 10 | FractalTensor(const FractalTensorTypeDesc& desc, 11 | std::shared_ptr alloc) 12 | : type_desc_(desc), alloc_(alloc), data_(nullptr) { 13 | long byteCount = type_desc_.GetNumBytes(); 14 | if (byteCount) data_ = alloc_->Allocate(byteCount); 15 | }; 16 | 17 | ~FractalTensor() = default; 18 | 19 | std::string DebugString() const; 20 | 21 | template 22 | const T* data() const { 23 | return reinterpret_cast(data_); 24 | } 25 | 26 | template 27 | T* mutable_data() { 28 | return reinterpret_cast(data_); 29 | } 30 | 31 | private: 32 | FractalTensorTypeDesc type_desc_; 33 | std::shared_ptr alloc_; 34 | 35 | void* data_; 36 | }; 37 | static inline std::ostream& operator<<(std::ostream& os, 38 | const FractalTensor& ft) { 39 | os << ft.DebugString(); 40 | return os; 41 | } 42 | 43 | } // namespace core 44 | } // namespace kaleido 45 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /cmake/external/pybind.cmake: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | include(ExternalProject) 7 | 8 | set(PYBIND_PREFIX_DIR ${THIRD_PARTY_PATH}/pybind) 9 | set(PYBIND_SOURCE_DIR ${PYBIND_PREFIX_DIR}/src/extern_pybind) 10 | set(PYBIND_REPOSITORY https://github.com/pybind/pybind11.git) 11 | set(PYBIND_TAG v2.11.1) 12 | 13 | cache_third_party( 14 | extern_pybind 15 | REPOSITORY 16 | ${PYBIND_REPOSITORY} 17 | TAG 18 | ${PYBIND_TAG} 19 | DIR 20 | PYBIND_SOURCE_DIR) 21 | 22 | set(PYBIND_INCLUDE_DIR ${PYBIND_SOURCE_DIR}/include) 23 | include_directories(${PYBIND_INCLUDE_DIR}) 24 | 25 | ExternalProject_Add( 26 | extern_pybind 27 | ${EXTERNAL_PROJECT_LOG_ARGS} 28 | ${SHALLOW_CLONE} 29 | "${PYBIND_DOWNLOAD_CMD}" 30 | PREFIX ${PYBIND_PREFIX_DIR} 31 | SOURCE_DIR ${PYBIND_SOURCE_DIR} 32 | UPDATE_COMMAND "" 33 | CONFIGURE_COMMAND "" 34 | BUILD_COMMAND "" 35 | INSTALL_COMMAND "" 36 | TEST_COMMAND "") 37 | 38 | add_library(pybind INTERFACE) 39 | add_dependencies(pybind extern_pybind) 40 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/README.md: -------------------------------------------------------------------------------- 1 | # Test Environment 2 | 3 | ``` {.text} 4 | OS: Ubuntu 16.04.7 LTS 5 | TensorFlow version: 2.2.3, compiled by gcc 5.0 6 | PyTorch v1.9.0 7 | CUDA Version 10.2 8 | CUDNN Version 7.6.5 9 | ``` 10 | ## CPU information 11 | 12 | ```bash 13 | lscpu 14 | ``` 15 | 16 | ``` {.text} 17 | Architecture: x86_64 18 | CPU op-mode(s): 32-bit, 64-bit 19 | Byte Order: Little Endian 20 | CPU(s): 12 # virtual CPU 21 | On-line CPU(s) list: 0-11 22 | Thread(s) per core: 2 23 | Core(s) per socket: 6 24 | Socket(s): 1 25 | NUMA node(s): 1 26 | Vendor ID: GenuineIntel 27 | CPU family: 6 28 | Model: 63 29 | Model name: Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz 30 | Stepping: 2 31 | CPU MHz: 1200.117 32 | CPU max MHz: 3700.0000 33 | CPU min MHz: 1200.0000 34 | BogoMIPS: 7000.36 35 | Virtualization: VT-x 
36 | L1d cache: 32K 37 | L1i cache: 32K 38 | L2 cache: 256K 39 | L3 cache: 15360K 40 | NUMA node0 CPU(s): 0-11 41 | ``` 42 | 43 | ### GPU information 44 | 45 | GeForce RTX 2080 Ti, Compute Capability 7.5 46 | -------------------------------------------------------------------------------- /examples/hello_world/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from typing import List, Tuple 7 | 8 | import torch 9 | 10 | import kaleido 11 | from kaleido import FractalTensor, FractalTensorStorage, Tensor, TensorStorage 12 | from kaleido import operations as ops 13 | 14 | seq_len = 10 15 | batch_size = 7 16 | 17 | hidden_dim = 512 18 | depth = 4 19 | 20 | device = 'cpu' 21 | 22 | 23 | def create_params(shape, depth): 24 | x = FractalTensor(TensorStorage(shape, kaleido.float32, device=device)) 25 | x.indices = list(range(depth)) 26 | x.initialize(torch.rand, *x.flatten_shape, device=device) 27 | return x 28 | 29 | 30 | xss = FractalTensor( 31 | FractalTensorStorage( 32 | TensorStorage((1, hidden_dim), kaleido.float32, device=device))) 33 | xss.indices = [list(range(seq_len)) for _ in range(batch_size)] 34 | xss.initialize(torch.rand, *xss.flatten_shape, device=device) 35 | 36 | Ws = create_params([hidden_dim, hidden_dim], depth) 37 | Us = create_params([hidden_dim, hidden_dim], depth) 38 | -------------------------------------------------------------------------------- /cmake/external/tvm.cmake: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | include(ExternalProject) 7 | 8 | set(TVM_PREFIX_DIR ${THIRD_PARTY_PATH}/tvm) 9 | set(TVM_SOURCE_DIR ${TVM_PREFIX_DIR}/src/extern_tvm) 10 | 11 | set(TVM_REPOSITORY https://github.com/apache/tvm.git) 12 | set(TVM_TAG v0.8.0) 13 | 14 | cache_third_party( 15 | extern_tvm 16 | REPOSITORY 17 | ${TVM_REPOSITORY} 18 | TAG 19 | ${TVM_TAG} 20 | DIR 21 | TVM_SOURCE_DIR) 22 | 23 | set(TVM_INCLUDE_DIR ${TVM_SOURCE_DIR}/include) 24 | include_directories(${TVM_INCLUDE_DIR}) 25 | 26 | ExternalProject_Add( 27 | extern_tvm 28 | ${EXTERNAL_PROJECT_LOG_ARGS} 29 | ${SHALLOW_CLONE} 30 | "${TVM_DOWNLOAD_CMD}" 31 | PREFIX ${TVM_PREFIX_DIR} 32 | SOURCE_DIR ${TVM_SOURCE_DIR} 33 | BUILD_IN_SOURCE 1 34 | COMMAND "cp ${TVM_SOURCE_DIR}/cmake/config.cmake ${TVM_SOURCE_DIR}" 35 | UPDATE_COMMAND "" 36 | CONFIGURE_COMMAND ${CMAKE_COMMAND} -DCMAKE_POSITION_INDEPENDENT_CODE=ON . 37 | BUILD_COMMAND $(MAKE) -j$(nproc) 38 | INSTALL_COMMAND "" 39 | TEST_COMMAND "") 40 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/grid_lstm/test_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import tensorflow as tf 7 | 8 | 9 | def get_config(): 10 | config = tf.compat.v1.ConfigProto( 11 | gpu_options=tf.compat.v1.GPUOptions( 12 | allow_growth=True, per_process_gpu_memory_fraction=0.2)) 13 | 14 | config.log_device_placement = False 15 | config.allow_soft_placement = True 16 | 17 | config.intra_op_parallelism_threads = 0 18 | config.inter_op_parallelism_threads = 56 19 | 20 | return config 21 | 22 | 23 | def device(dtype="cpu"): 24 | """Return the TF device string. 
25 | 26 | Args: 27 | dtype: String, "cpu" or "gpu". 28 | 29 | Raises: 30 | ValueError: if dtype is an unknown device. 31 | """ 32 | if dtype == "cpu": 33 | return "/device:CPU:0" 34 | elif dtype == "gpu": 35 | assert tf.test.is_gpu_available(cuda_only=True) 36 | return "/device:GPU:0" 37 | else: 38 | raise ValueError("Unknown device type. Should be cpu or gpu.") 39 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/test_product.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | from context import * 9 | 10 | 11 | class TestProduct(unittest.TestCase): 12 | def setUp(self): 13 | random.seed(12345) 14 | 15 | data1 = list(range(11)) 16 | self.x = kaleido.FractalTensor.from_pylist(data1) 17 | 18 | data2 = list(range(7)) 19 | self.y = kaleido.FractalTensor.from_pylist(data2) 20 | 21 | def test_product(self): 22 | xss, yss = kaleido.operations.product(self.x, self.y) 23 | self.assertTrue(isinstance(xss, kaleido.FractalTensor)) 24 | self.assertTrue(isinstance(yss, kaleido.FractalTensor)) 25 | 26 | for i, (xs, ys) in enumerate(kaleido.operations.zip(xss, yss)): 27 | for j, (x, y) in enumerate(kaleido.operations.zip(xs, ys)): 28 | self.assertEqual(x.data.item(), j) 29 | self.assertEqual(y.data.item(), i) 30 | 31 | 32 | if __name__ == '__main__': 33 | unittest.main() 34 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/test_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | 
# Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import tensorflow as tf 7 | 8 | 9 | def get_config(): 10 | config = tf.compat.v1.ConfigProto( 11 | gpu_options=tf.compat.v1.GPUOptions( 12 | allow_growth=True, per_process_gpu_memory_fraction=0.2)) 13 | 14 | config.log_device_placement = False 15 | config.allow_soft_placement = True 16 | 17 | config.intra_op_parallelism_threads = 0 18 | config.inter_op_parallelism_threads = 56 19 | 20 | return config 21 | 22 | 23 | def device(dtype='cpu'): 24 | '''Return the TF device string. 25 | 26 | Args: 27 | dtype: String, 'cpu' or 'gpu'. 28 | 29 | Raises: 30 | ValueError: if dtype is an unknown device. 31 | ''' 32 | 33 | if dtype == 'cpu': 34 | return '/device:CPU:0' 35 | elif dtype == 'gpu': 36 | assert len(tf.config.list_physical_devices('GPU')) 37 | return '/device:GPU:0' 38 | else: 39 | raise ValueError('Unknown device type. Should be cpu or gpu.') 40 | -------------------------------------------------------------------------------- /cmake/external/cccl.cmake: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | include(ExternalProject) 7 | 8 | set(CCCL_PREFIX_DIR ${THIRD_PARTY_PATH}/cccl) 9 | set(CCCL_SOURCE_DIR ${CCCL_PREFIX_DIR}/src/extern_cccl) 10 | set(CCCL_REPOSITORY https://github.com/NVIDIA/cccl.git) 11 | set(CCCL_TAG v2.3.0-rc0) 12 | 13 | cache_third_party( 14 | extern_cccl 15 | REPOSITORY 16 | ${CCCL_REPOSITORY} 17 | TAG 18 | ${CCCL_TAG} 19 | DIR 20 | CCCL_SOURCE_DIR) 21 | 22 | set(CUB_INCLUDE_DIR ${CCCL_SOURCE_DIR}/cub) 23 | set(THRUST_INCLUDE_DIR ${CCCL_SOURCE_DIR}/thrust) 24 | set(LIBCUDACXX_INCLUDE_DIR ${CCCL_SOURCE_DIR}/libcudacxx) 25 | 26 | ExternalProject_Add( 27 | extern_cccl 28 | ${EXTERNAL_PROJECT_LOG_ARGS} 29 | ${SHALLOW_CLONE} 30 | "${CCCL_DOWNLOAD_CMD}" 31 | PREFIX ${CCCL_PREFIX_DIR} 32 | SOURCE_DIR ${CCCL_SOURCE_DIR} 33 | UPDATE_COMMAND "" 34 | CONFIGURE_COMMAND "" 35 | BUILD_COMMAND "" 36 | INSTALL_COMMAND "" 37 | TEST_COMMAND "") 38 | 39 | include_directories(${CUB_INCLUDE_DIR}) 40 | include_directories(${THRUST_INCLUDE_DIR}) 41 | include_directories(${LIBCUDACXX_INCLUDE_DIR}) 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 
8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /examples/flash_attention/flash_attention_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import torch 7 | 8 | import kaleido 9 | from kaleido import FractalTensor, FractalTensorStorage, TensorStorage 10 | 11 | 12 | def create_input(head_dim: int, 13 | num_heads: int, 14 | seq_len: int, 15 | batch_size: int, 16 | block_dim: int, 17 | device: str = 'cpu'): 18 | # depth-1: batch_size 19 | # depth-2: num_heads 20 | # depth-3: block_num = seq_length / block_dim 21 | xsss = FractalTensor( 22 | FractalTensorStorage( 23 | FractalTensorStorage( 24 | TensorStorage((block_dim, head_dim), 25 | kaleido.float32, 26 | device=device)))) 27 | indices = [] 28 | block_num = int(seq_len / block_dim) 29 | for _ in range(batch_size): 30 | indices.append([list(range(block_num)) for _ in range(num_heads)]) 31 | xsss.indices = indices 32 | xsss.initialize(torch.rand, *xsss.flatten_shape, device=device) 33 | return xsss 34 | -------------------------------------------------------------------------------- /examples/sparse_attention/torch_windowed_attention_demo.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | 9 | 10 | def test(qss, kss, vss): 11 | # Q, K, V: [batch_size, block_num, block_size, hidden] 12 | Q = qss[:, :, 2:-2:, :] 13 | K = torch.cat((kss[:, :, 1:-3, :], kss[:, :, 2:-2, :], kss[:, :, 3:-1, :]), 14 | 2) 15 | V = torch.cat((vss[:, :, 1:-3, :], vss[:, :, 2:-2, :], vss[:, :, 3:-1, :]), 16 | 2) 17 | QK = torch.einsum("blqd,blkd->blqk", Q, K) 18 | attn_weights = F.softmax(QK, -1) 19 | attn_vecs = torch.einsum("blqk,blkd->blqd", attn_weights, V) 20 | return attn_vecs 21 | 22 | 23 | if __name__ == '__main__': 24 | batch_size = 16 25 | block_size = 16 26 | seq_len = 512 27 | hidden_dim = 128 28 | block_num = seq_len // block_size 29 | 30 | queries = torch.rand(batch_size, block_num, block_size, hidden_dim) 31 | keys = torch.rand(batch_size, block_num, block_size, hidden_dim) 32 | values = torch.rand(batch_size, block_num, block_size, hidden_dim) 33 | 34 | test(queries, keys, values) 35 | -------------------------------------------------------------------------------- /benchmarks/rnn/cuDNN/lstm_cell_cudnn.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "utils.h" 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | int main(int argc, char* argv[]) { 15 | assert(argc == 2); 16 | const char* filename = argv[1]; 17 | 18 | std::ofstream fout; 19 | fout.setf(std::ios::fixed); 20 | fout.precision(4); 21 | 22 | fout.open(filename, std::ios::out); 23 | 24 | srand(1234); 25 | constexpr std::array hidden_sizes = {128, 256, 512, 1024}; 26 | constexpr std::array batch_sizes = {32, 64, 128, 256}; 27 | const int seq_length = 1; 28 | const int depth = 1; 29 | 30 | fout << "[depth, seq_length, batch_size, hidden_size]\tAvgTime(ms)" 31 | << std::endl; 32 | 33 | for (auto hidden_size : hidden_sizes) { 34 | for (auto batch_size : batch_sizes) { 35 | genSeqs(batch_size, seq_length, false); 36 | float cudnn_time = TestCuDNNLSTM(batch_size, hidden_size, seq_length, 37 | depth, hidden_size); 38 | 39 | fout << "[" << depth << ", " << seq_length << ", " << batch_size << ", " 40 | << hidden_size << "]\t" << cudnn_time << std::endl; 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /kaleido/core/layout.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace kaleido { 6 | namespace core { 7 | 8 | using namespace cute; 9 | 10 | // In the row major layout, the contiguous dimension in memory is the 11 | // last dimension. 12 | template 13 | using RowMajor = 14 | cute::Layout, Int>, Stride, _1>>; 15 | 16 | __device__ auto make_row_major_layout(const int row, const int col, 17 | const int stride) { 18 | return cute::make_layout(make_shape(row, col), make_stride(stride, 1)); 19 | } 20 | 21 | // In the column major layout, the contiguous dimension in memory is the 22 | // first dimension. 
23 | template 24 | using ColMajor = 25 | cute::Layout, Int>, Stride<_1, Int>>; 26 | 27 | __device__ auto make_col_major_layout(const int row, const int col, 28 | const int stride) { 29 | return cute::make_layout(make_shape(row, col), make_stride(1, stride)); 30 | } 31 | 32 | template 33 | static constexpr size_t num_rows = cute::size<0>(Layout{}); 34 | 35 | template /* */ 36 | static constexpr size_t num_cols = cute::size<1>(Layout{}); 37 | 38 | } // namespace core 39 | } // namespace kaleido 40 | -------------------------------------------------------------------------------- /kaleido/core/operators/softmax_op.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/device/gpu_context.h" 5 | #include "kaleido/core/device/kernels/softmax.h" 6 | #include "kaleido/core/operators/softmax_op.h" 7 | #include "kaleido/core/types.h" 8 | 9 | namespace kaleido { 10 | namespace core { 11 | namespace ops { 12 | 13 | template 14 | class SoftmaxOp { 15 | public: 16 | void operator()(const GPUContext& context, const Tensor& x, Tensor& y, 17 | int dim = 0) { 18 | assert(x.ndim() == 2 && y.ndim() == 2 && x.shape() == y.shape()); 19 | if (dim == 1) LOG(FATAL) << "Not implemented yet."; 20 | 21 | const int kThreadsPerBlock = context.GetMaxThreadsPerBlock(); 22 | int width = x.dim_size(1); 23 | int height = x.dim_size(0); 24 | 25 | int block_num = 26 | width > kThreadsPerBlock 27 | ? 
kThreadsPerBlock 28 | : pow(2, static_cast(log2(static_cast(width)))); 29 | 30 | dim3 block(block_num, 1); 31 | dim3 grid(height, 1); 32 | 33 | cuda_kernel::KeMatrixSoftMax<<>>( 34 | x.data(), y.mutable_data(), width); 35 | } 36 | }; 37 | 38 | template class SoftmaxOp; 39 | 40 | } // namespace ops 41 | } // namespace core 42 | } // namespace kaleido 43 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/test_zip.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | from context import * 9 | 10 | 11 | class TestZip(unittest.TestCase): 12 | def setUp(self): 13 | random.seed(12345) 14 | 15 | data = list(range(7)) 16 | self.xs1 = kaleido.FractalTensor.from_pylist(data) 17 | 18 | data = list(range(7, 14, 1)) 19 | self.xs2 = kaleido.FractalTensor.from_pylist(data) 20 | 21 | def test_zipped_ta(self): 22 | zipped = kaleido.operations.zip(self.xs1, self.xs2) 23 | self.assertTrue(isinstance(zipped, kaleido.Iterative)) 24 | 25 | for x1, x2 in zipped: 26 | self.assertEqual(x2.data - x1.data, 7) 27 | 28 | def test_nested_zip(self): 29 | zipped = kaleido.operations.zip( 30 | kaleido.operations.zip(self.xs1, self.xs2), self.xs1) 31 | self.assertTrue(isinstance(zipped, kaleido.Iterative)) 32 | 33 | for xz, z in zipped: 34 | x, y = xz 35 | self.assertEqual(y.data - x.data, 7) 36 | self.assertEqual(y.data - z.data, 7) 37 | 38 | 39 | if __name__ == '__main__': 40 | unittest.main() 41 | -------------------------------------------------------------------------------- /benchmarks/rnn/cuDNN/stacked_lstm_cudnn.cu: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "utils.h" 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | int main(int argc, char* argv[]) { 15 | assert(argc == 2); 16 | const char* filename = argv[1]; 17 | 18 | std::ofstream fout; 19 | fout.setf(std::ios::fixed); 20 | fout.precision(4); 21 | 22 | fout.open(filename, std::ios::out); 23 | 24 | srand(1234); 25 | constexpr std::array hidden_sizes = {64, 128, 256, 512, 1024}; 26 | constexpr std::array batch_sizes = {32, 64}; 27 | constexpr size_t seq_length = 16; 28 | constexpr std::array depths = {1, 2, 4, 8, 16, 32}; 29 | 30 | fout << "[depth, seq_length, batch_size, hidden_size]\tAvgTime(ms)" 31 | << std::endl; 32 | 33 | for (auto depth : depths) { 34 | for (auto hidden_size : hidden_sizes) { 35 | for (auto batch_size : batch_sizes) { 36 | genSeqs(batch_size, seq_length, false); 37 | float cudnn_time = TestCuDNNLSTM(batch_size, hidden_size, seq_length, 38 | depth, hidden_size); 39 | 40 | fout << "[" << depth << ", " << seq_length << ", " << batch_size << ", " 41 | << hidden_size << "]\t" << cudnn_time << std::endl; 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /kaleido/core/operators/online_softmax_op.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "kaleido/core/device/gpu_context.h" 5 | #include "kaleido/core/device/kernels/online_softmax.h" 6 | #include "kaleido/core/operators/online_softmax_op.h" 7 | #include "kaleido/core/types.h" 8 | 9 | namespace kaleido { 10 | namespace core { 11 | namespace ops { 12 | 13 | template 14 | class OnlineNormalizedSoftmaxOp { 15 | public: 16 | void operator()(const GPUContext& context, const Tensor& x, Tensor& y, 17 | int dim = 0) { 18 | assert(x.ndim() == 2 && y.ndim() == 2 && x.shape() == y.shape()); 19 | if (dim == 1) LOG(FATAL) << "Not implemented yet."; 20 | 21 | const int kThreadsPerBlock = context.GetMaxThreadsPerBlock(); 22 | int width = x.dim_size(1); 23 | int height = x.dim_size(0); 24 | 25 | int block_num = 26 | width > kThreadsPerBlock 27 | ? kThreadsPerBlock 28 | : pow(2, static_cast(log2(static_cast(width)))); 29 | 30 | dim3 block(block_num, 1); 31 | dim3 grid(height, 1); 32 | 33 | cuda_kernel::KeOnlineNormalizedSoftMax<<>>( 34 | x.data(), y.mutable_data(), width); 35 | } 36 | }; 37 | 38 | template class OnlineNormalizedSoftmaxOp; 39 | 40 | } // namespace ops 41 | } // namespace core 42 | } // namespace kaleido 43 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tensor/arithmetic/contraction.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | from typing import Tuple 9 | 10 | import torch 11 | 12 | import kaleido 13 | from kaleido.frontend.operations.base import Contraction 14 | 15 | __all__ = [ 16 | 'mm', 17 | 'outer', 18 | ] 19 | 20 | 21 | class MatMul(Contraction): 22 | """ (tensor contraction: reduce + map) matrix multiplication 23 | 24 | y = a $\otimes$ b 25 | """ 26 | 27 | def __call__(self, x: kaleido.Tensor, y: kaleido.Tensor) -> kaleido.Tensor: 28 | t = super(MatMul, self).__call__(x, y) 29 | 30 | t.data = x.data @ y.data 31 | t._type._shape = list(t.data.shape) 32 | t.recompute_strides() 33 | return t 34 | 35 | 36 | mm = MatMul() 37 | 38 | 39 | class Outer(Contraction): 40 | """ Outer product of two vector.""" 41 | 42 | def __call__(self, x: kaleido.Tensor, y: kaleido.Tensor) -> kaleido.Tensor: 43 | t = super(Outer, self).__call__(x, y) 44 | 45 | t.data = torch.outer(x.data, y.data) 46 | t._type._shape = list(t.data.shape) 47 | 48 | t.recompute_strides() 49 | return t 50 | 51 | 52 | outer = Outer() 53 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | minimum_pre_commit_version: 3.0.0 2 | 3 | repos: 4 | - repo: https://github.com/Lucas-C/pre-commit-hooks.git 5 | rev: v1.5.5 6 | hooks: 7 | - id: remove-crlf 8 | files: (?!.*third_party)^.*$ | (?!.*book)^.*$ 9 | - repo: https://github.com/pre-commit/mirrors-yapf.git 10 | rev: v0.32.0 11 | hooks: 12 | - id: yapf 13 | additional_dependencies: [toml] 14 | files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ 15 | - repo: https://github.com/pycqa/isort 16 | rev: 5.13.2 17 | hooks: 18 | - id: isort 19 | name: isort (python) 20 | - repo: https://github.com/pre-commit/pre-commit-hooks 21 | rev: v4.6.0 22 | hooks: 23 | - id: check-added-large-files 24 | - id: 
check-merge-conflict 25 | - id: check-symlinks 26 | - id: detect-private-key 27 | files: (?!.*third_party)^.*$ | (?!.*book)^.*$ 28 | - id: end-of-file-fixer 29 | - id: check-yaml 30 | - id: check-toml 31 | - id: check-ast 32 | - id: check-executables-have-shebangs 33 | - id: check-shebang-scripts-are-executable 34 | - id: detect-private-key 35 | - id: debug-statements 36 | - repo: local 37 | hooks: 38 | - id: clang-format-with-version-check 39 | name: clang-format 40 | description: Format files with ClangFormat. 41 | entry: bash ./scripts/clang_format.hook -i 42 | language: system 43 | files: \.(c|cc|cxx|cpp|cu|h|cuh|hpp|hxx)$ 44 | -------------------------------------------------------------------------------- /examples/flash_attention/README.md: -------------------------------------------------------------------------------- 1 | # Algorithm Idea 2 | 3 | ## The intuition 4 | 5 | Given a query $q_i \in \mathbb{R}^d$, and lists of keys and values $k_1,\cdots,k_n$ and $v_1, \cdots, v_n \in \mathbb{R}^d$ of length $n$ 6 | 7 | 8 | $$\begin{align} 9 | s_i &= \text{dot}(q, k_i) \\ 10 | s_{i}' &= \frac{e^{s_i}}{\sum_je^{s_j}} \\ 11 | \text{attention}(q,k,v) &= \sum_i{v_is_i'} \\ 12 | \end{align}$$ 13 | 14 | The summation in equation (2) could be moved to the very end of the attention operation (3) 15 | 16 | $$\begin{align*} 17 | s_i &= \text{dot}(q, k_i) \\ 18 | s_{i}' &= e^{s_i} \\ 19 | \text{attention}(q,k,v) &= \frac{\sum_i{v_is_i'}}{\sum_je^{s_j}} 20 | \end{align*}$$ 21 | 22 | The processing can be written as: 23 | 24 | $$\begin{align*} 25 | s_i &= \text{dot}(q,k_i) \\ 26 | v^* &\leftarrow v^* + v_ie^{s_i} \\ 27 | s^* &\leftarrow s^* + e^{s_i} \\ 28 | \text{attention}(q,k,v) &= \frac{v^*}{s^*} 29 | \end{align*}$$ 30 | 31 | ## Numerical Stability 32 | 33 | Initialize $v^* \in \mathbb{R}^d = 0$, $s^* \in \mathbb{R} = 0$, $m^* = -\infty$ 34 | 35 | $$\begin{align*} 36 | s_i &= \text{dot}(q,k_i) \\ 37 | m_i &= \text{max}(m^*,s_i)\\ 38 | v^* &\leftarrow v^*e^{m^*-m_i} + 
v_ie^{s_i-m_i} \\ 39 | s^* &\leftarrow s^*e^{m^*-m_i} + e^{s_i-m_i} \\ 40 | m^* &\leftarrow m_i \\ 41 | \text{attention}(q,k,v) &= \frac{v^*}{s^*} \\ 42 | \end{align*}$$ 43 | 44 | # Reference 45 | 46 | 1. Rabe, Markus N., and Charles Staats. "[Self-attention Does Not Need $ O (n^ 2) $ Memory](https://arxiv.org/pdf/2112.05682.pdf)." arXiv preprint arXiv:2112.05682 (2021). 47 | -------------------------------------------------------------------------------- /examples/grid_rnn/grid_rnn_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import random 7 | 8 | import torch 9 | 10 | import kaleido 11 | from examples.utils import gen_dataset 12 | from kaleido import Tensor 13 | 14 | device = 'cpu' 15 | # device = 'cuda' 16 | 17 | depth = 3 18 | batch_size = 16 19 | vocab_size = 5000 20 | hidden_dim = 128 21 | 22 | MIN_LEN = 3 23 | MAX_LEN = 7 24 | 25 | src_words = gen_dataset(batch_size, vocab_size, device=device) 26 | trg_words = gen_dataset(batch_size, vocab_size, device=device) 27 | 28 | src_emb = Tensor((vocab_size, hidden_dim), kaleido.float32, device=device) 29 | src_emb.initialize(torch.rand, *src_emb.shape, device=device) 30 | 31 | trg_emb = Tensor((vocab_size, hidden_dim), kaleido.float32, device=device) 32 | trg_emb.initialize(torch.rand, *trg_emb.shape, device=device) 33 | 34 | 35 | def create_cell(): 36 | i2h = Tensor((2 * hidden_dim, hidden_dim), kaleido.float32, device=device) 37 | i2h.initialize(torch.rand, *i2h.shape, device=device) 38 | 39 | h2h = Tensor((hidden_dim, hidden_dim), kaleido.float32, device=device) 40 | h2h.initialize(torch.rand, *h2h.shape, device=device) 41 | 42 | bias = Tensor((1, hidden_dim), kaleido.float32, device=device) 43 | 
bias.initialize(torch.rand, *bias.shape, device=device) 44 | return {'i2h': i2h, 'h2h': h2h, 'bias': bias} 45 | -------------------------------------------------------------------------------- /kaleido/core/operators/launch_config.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace kaleido { 4 | namespace core { 5 | namespace ops { 6 | 7 | inline bool IsPow2(unsigned int x) { return ((x & (x - 1)) == 0); } 8 | 9 | inline unsigned int NextPow2(unsigned int x) { 10 | --x; 11 | x |= x >> 1; 12 | x |= x >> 2; 13 | x |= x >> 4; 14 | x |= x >> 8; 15 | x |= x >> 16; 16 | return ++x; 17 | } 18 | 19 | inline unsigned int Log2Floor(unsigned int x) { 20 | if (x == 0) return -1U; 21 | int log = 0; 22 | unsigned int value = x; 23 | for (int i = 4; i >= 0; --i) { 24 | int shift = (1 << i); 25 | unsigned int n = value >> shift; 26 | if (n != 0) { 27 | value = n; 28 | log += shift; 29 | } 30 | } 31 | assert(value == 1); 32 | return log; 33 | } 34 | 35 | template 36 | inline T DivUp(const X x, const Y y) { 37 | return static_cast((x + y - 1) / y); 38 | } 39 | 40 | void GetGpuLaunchConfig1D(const GPUContext& ctx, int64_t numel, int* threads, 41 | int* blocks) { 42 | int num_threads = ctx.GetMaxThreadsPerBlock(); 43 | int sm_count = ctx.GetSMCount(); 44 | 45 | if (numel / (sm_count << 1) < num_threads) 46 | num_threads = NextPow2(numel / (sm_count << 1)); 47 | else if (numel / (sm_count << 2) < num_threads) 48 | num_threads = NextPow2(numel / (sm_count << 2)); 49 | 50 | *threads = std::max(64, num_threads); 51 | *blocks = DivUp(numel, *threads); 52 | } 53 | 54 | } // namespace ops 55 | } // namespace core 56 | } // namespace kaleido 57 | -------------------------------------------------------------------------------- /kaleido/core/operators/tests/b2b_gemm_test_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | A and D are laid out in row-major 
fashion 5 | B and C are laid out in column-major fashion 6 | 7 | A[m, k] @ B[k, n] 8 | D[m, p] = P[m, n] @ C[n, p] 9 | */ 10 | template 11 | void cublas_two_hgemms(cublasHandle_t& handle, const kaleido::core::Tensor& A, 12 | const kaleido::core::Tensor& B, 13 | const kaleido::core::Tensor& C, kaleido::core::Tensor& P, 14 | kaleido::core::Tensor& D) { 15 | int kM = A.dim_size(0); 16 | int kN = B.dim_size(1); 17 | int kK = A.dim_size(1); 18 | int kP = C.dim_size(1); 19 | 20 | // cuBLAS gemm as the groundtruth 21 | kaleido::core::cuda_kernel::CublasGemm hgemm; 22 | 23 | Element alf = static_cast(1.); 24 | Element bet = static_cast(0.); 25 | 26 | // P = A @ B 27 | // P^T = B^T @ A^T 28 | hgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N /* transb*/, kN, kM, kK, &alf, 29 | B.data(), B.dim_size(0), A.data(), A.dim_size(1), 30 | &bet, P.mutable_data(), P.dim_size(1)); 31 | 32 | // D = P @ C, D and P are laid out in row-major fashion, while C is in 33 | // column major fashion. Operands of cuBLAS is by default in column fashion. 
34 | // D^T = C^T @ P^T; [p, m] = [p, n] @ [n, m] 35 | hgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N /* transb*/, kP, kM, kN, &alf, 36 | C.data(), C.dim_size(0), P.data(), P.dim_size(1), 37 | &bet, D.mutable_data(), D.dim_size(1)); 38 | } 39 | -------------------------------------------------------------------------------- /kaleido/core/place.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "boost/variant.hpp" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | 8 | struct CPUPlace { 9 | CPUPlace() {} 10 | 11 | inline bool operator==(const CPUPlace&) const { return true; } 12 | inline bool operator!=(const CPUPlace&) const { return false; } 13 | inline bool operator<(const CPUPlace&) const { return false; } 14 | }; 15 | 16 | struct CUDAPlace { 17 | CUDAPlace() : CUDAPlace(0) {} 18 | explicit CUDAPlace(int d) : device(d) {} 19 | 20 | inline int GetDeviceId() const { return device; } 21 | inline bool operator==(const CUDAPlace& o) const { 22 | return device == o.device; 23 | } 24 | inline bool operator!=(const CUDAPlace& o) const { return !(*this == o); } 25 | inline bool operator<(const CUDAPlace& o) const { return device < o.device; } 26 | 27 | int device; 28 | }; 29 | 30 | class Place : public boost::variant { 31 | private: 32 | using PlaceBase = boost::variant; 33 | 34 | public: 35 | Place() = default; 36 | Place(const CPUPlace& cpu_place) : PlaceBase(cpu_place) {} 37 | Place(const CUDAPlace& cuda_place) : PlaceBase(cuda_place) {} 38 | 39 | bool operator<(const Place& place) const { 40 | return PlaceBase::operator<(static_cast(place)); 41 | } 42 | 43 | bool operator==(const Place& place) const { 44 | return PlaceBase::operator==(static_cast(place)); 45 | } 46 | }; 47 | 48 | std::ostream& operator<<(std::ostream&, const Place&); 49 | 50 | } // namespace core 51 | } // namespace kaleido 52 | -------------------------------------------------------------------------------- 
/examples/sparse_attention/README.md: -------------------------------------------------------------------------------- 1 | # Hyper-parameters for the BERT model 2 | 3 | - batch size of 256 sequences 4 | - each sequence has 512 tokens 5 | 6 | ||BERT base|BERT large| 7 | |:--|:--|:--| 8 | |number of Transformer blocks ($L$)|12|24| 9 | |hidden size ($H$)|768|1024| 10 | |self-attention head ($A$)|12|16| 11 | |feed-forward/filter = 4$H$ |3072|4096| 12 | |total parameters|110MB|340MB| 13 | 14 | # BigBird attending pattern 15 | 16 |

17 | 18 |

19 | 20 | 21 | The implementation refers to: 22 | 23 | 1. https://github.com/google-research/bigbird 24 | 1. https://github.com/sanghuynh1501/bigbird_pytorch 25 | 26 | Pseudo-codes for the blocked windowed-attention 27 | 28 | ```c++ 29 | qss: List> // batch_size, block_num, [block_size, hidden] 30 | kss: List> // batch_size, block_num, [hidden, block_size] 31 | vss: List> // batch_size, block_num, [block_size, hidden] 32 | 33 | wss: List> // batch_size, block_num, [block_size, hidden] 34 | 35 | for 0 <= i < len(qss) // iterate over `batch_size` 36 | for 2 <= j < len(qss[i]) - 2 // iterate over `block_num` 37 | for -1 <= k <= 1 // iterate over window size 38 | // [block_size, hidden] @ [hidden, block_size] 39 | ss1[k + 1] = qss[i][j] @ kss[i][j + k] 40 | 41 | // ss1 has a shape of [block_size, block_size * 3] 42 | ss2 = softmax(ss1[:]) 43 | 44 | // [block_size , block_size * 3] @ [block_size * 3, hidden] 45 | wss[i][j] = ss2 @ vss[i][j - 1 : j + 1] 46 | ``` 47 | -------------------------------------------------------------------------------- /docs/fractaltensor_operations/memory_layout_of_fractaltensor.md: -------------------------------------------------------------------------------- 1 | # Memory layout of jagged FractalTensor 2 | 3 | `FractalTensor` is a nested collection whose elements could be a set of `FractalTensor` variables. Underneath the hood, a `FractalTensor` is just a convenient way of describing large blocks of computer memory, so that the elements contained could be efficiently iterated over and manipulated. 4 | 5 | Recap the constraint of `FractalTensor` : 6 | 7 | 1. All `FractalTensor` elements (could be integers, tensors, FractalTensors) are homogenous. 8 | 1. If two `FractalTensor` types have different depths, they are treated as inequivalent types. 9 | 10 | When `FractalTensor` is nested, it is easy to conclude that all tensors contained in a `FractalTensor` have the same depths. The indices of a `FractalTensor` is organized as a tree. 
At compile time, only the depth of the `FractalTensor` is known. The exact structure of the indices tree is not known. 11 | 12 | 

13 |
14 | Fig 1. The memory layout of a FractalTensor variable x. 

16 | 17 | `FractalTensor` supports random read access. Elements contained in a `FractalTensor` can be indexed using the `[]` operator. 18 | 19 | Example: `a = x[1]` : 20 | 21 |

22 |
23 | Fig 2. Indexing the FractalTensor variable x. 

25 | 26 | Example: `a = x[2][1][3]` : 27 | 28 |

29 |
30 | Fig 3. Indexing the FractalTensor variable x with a multi-level index. 

32 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | cmake_minimum_required(VERSION 3.18) # cutlass 3.2 requires cmake 3.18+ 7 | 8 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") 9 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 10 | "${CMAKE_SOURCE_DIR}/cmake/Modules/") 11 | 12 | set(PYPARSER_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) 13 | set(PYPARSER_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) 14 | set(PYPARSER_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") 15 | 16 | project(kaleido CXX C) 17 | message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " 18 | "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") 19 | message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " 20 | "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") 21 | 22 | find_package(Threads REQUIRED) 23 | find_package(CUDA REQUIRED) 24 | find_package(CuDNN REQUIRED) 25 | 26 | set(Boost_USE_STATIC_LIBS OFF) 27 | set(Boost_USE_MULTITHREADED ON) 28 | set(Boost_USE_STATIC_RUNTIME OFF) 29 | find_package(Boost 1.45.0 COMPONENTS filesystem regex) 30 | 31 | if(Boost_FOUND) 32 | include_directories(${Boost_INCLUDE_DIR}) 33 | add_definitions("-DHAS_BOOST") 34 | else() 35 | message(FATAL_ERROR "Cannot find Boost.") 36 | endif() 37 | 38 | include(generic) 39 | include(python) 40 | include(third_party) 41 | 42 | add_subdirectory(kaleido/core) 43 | -------------------------------------------------------------------------------- /cmake/external/cutlass.cmake: -------------------------------------------------------------------------------- 1 | # 
------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | include(ExternalProject) 7 | 8 | set(CUTLASS_PREFIX_DIR ${THIRD_PARTY_PATH}/cutlass) 9 | set(CUTLASS_SOURCE_DIR ${CUTLASS_PREFIX_DIR}/src/extern_cutlass) 10 | set(CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git) 11 | set(CUTLASS_TAG v3.2.2) 12 | 13 | cache_third_party( 14 | extern_cutlass 15 | REPOSITORY 16 | ${CUTLASS_REPOSITORY} 17 | TAG 18 | ${CUTLASS_TAG} 19 | DIR 20 | CUTLASS_SOURCE_DIR) 21 | 22 | set(CUTLASS_INCLUDE_DIR "${CUTLASS_SOURCE_DIR}/include") 23 | include_directories(${CUTLASS_INCLUDE_DIR}) 24 | include_directories("${CUTLASS_SOURCE_DIR}/tools/util/include") 25 | 26 | ExternalProject_Add( 27 | extern_cutlass 28 | ${EXTERNAL_PROJECT_LOG_ARGS} 29 | ${SHALLOW_CLONE} 30 | "${CUTLASS_DOWNLOAD_CMD}" 31 | PREFIX ${CUTLASS_PREFIX_DIR} 32 | SOURCE_DIR ${CUTLASS_SOURCE_DIR} 33 | UPDATE_COMMAND "" 34 | CONFIGURE_COMMAND "" 35 | BUILD_COMMAND "" 36 | INSTALL_COMMAND "" 37 | TEST_COMMAND "") 38 | 39 | add_library(cutlass INTERFACE) 40 | target_include_directories(cutlass INTERFACE ${ROOT_DIR}/include 41 | ${ROOT_DIR}/tools/util/include) 42 | target_link_libraries(cutlass INTERFACE CUDA::cudart) 43 | target_include_directories(cutlass 44 | INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 45 | add_dependencies(cutlass extern_cutlass) 46 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/test_constants.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | from context import * 9 | 10 | 11 | class TestConstantsOp(unittest.TestCase): 12 | def setUp(self): 13 | random.seed(12345) 14 | 15 | def test_arange(self): 16 | x1 = kaleido.operations.slices(kaleido.operations.arange(11), dim=0) 17 | self.assertTrue(isinstance(x1, kaleido.FractalTensor)) 18 | self.assertTrue( 19 | isinstance(x1.element_type.element_type, 20 | kaleido.frontend.types.Int)) 21 | y = list(range(11)) 22 | for i, x in enumerate(x1): 23 | self.assertEqual(y[i], x.data.item()) 24 | 25 | x2 = kaleido.operations.slices(kaleido.operations.arange(5, 7), dim=0) 26 | y = list(range(5, 7)) 27 | for i, x in enumerate(x2): 28 | self.assertEqual(y[i], x.data.item()) 29 | 30 | x3 = kaleido.operations.slices( 31 | kaleido.operations.arange(5, 24, 3), dim=0) 32 | y = list(range(5, 24, 3)) 33 | for i, x in enumerate(x3): 34 | self.assertEqual(y[i], x.data.item()) 35 | 36 | def test_constants(self): 37 | for d in ['cpu', 'cuda']: 38 | x = kaleido.operations.zeros(shape=(3, 7), dtype='float', device=d) 39 | y = kaleido.operations.ones(shape=(3, 7), dtype='float', device=d) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /kaleido/core/device/kernels/lstm/stacked_lstm/region1.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #pragma once 5 | #include "kaleido/core/device/cuda_timer.h" 6 | #include "kaleido/core/device/kernels/lstm.h" 7 | 8 | namespace kaleido::core::cuda_kernel { 9 | template 11 | float StackedLstmRegion1(Element* hsss, Element* csss, const Element* xss, 12 | const Element* ws, const Element* us, const int depth, 13 | const int seq_length, const int batch_size, 14 | const int hidden_size) { 15 | CudaTimer timer; 16 | const Element* x = xss; 17 | Element* init; 18 | cudaMalloc((void**)&init, sizeof(Element) * hidden_size * batch_size); 19 | // Fill zero 20 | cudaMemset(reinterpret_cast(init), 0, 21 | sizeof(Element) * hidden_size * batch_size); 22 | 23 | const Element* c_init = init; 24 | const Element* h_init = init; 25 | const Element* w = ws; 26 | const Element* u = us; 27 | Element* css = csss; 28 | Element* hss = hsss; 29 | 30 | // TODO: NotFused version. 31 | using CuteFusedLSTMLayer = 32 | cuda_kernel::CuteLSTMLayer; 34 | 35 | CuteFusedLSTMLayer cute_fused_lstm_layer; 36 | 37 | float time = 38 | cute_fused_lstm_layer(w, x, u, c_init, h_init, css, hss, seq_length); 39 | 40 | CudaCheck(cudaFree(init)); 41 | 42 | return time; 43 | } 44 | } // namespace kaleido::core::cuda_kernel 45 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/figures/perf_with_increased_depth_subplot1.tsv: -------------------------------------------------------------------------------- 1 | Depth TestName AvgTime Throughput Ratio 2 | 1 PT-JIT 1.96780 16261.78517 1.00000 3 | 4 PT-JIT 7.77687 4114.76474 3.95206 4 | 8 PT-JIT 15.53171 2060.30052 7.89292 5 | 12 PT-JIT 24.19508 1322.58278 12.29548 6 | 16 PT-JIT 32.10150 996.83803 16.31337 7 | 20 PT-JIT 41.20204 776.66064 20.93808 8 | 1 TF-WhileOpLSTM 3.90668 8191.10010 1.00000 9 | 4 TF-WhileOpLSTM 13.55371 2360.97774 3.46937 10 | 8 TF-WhileOpLSTM 27.79051 1151.47235 7.11359 11 | 12 TF-WhileOpLSTM 43.66595 732.83646 11.17726 12 | 16 TF-WhileOpLSTM 62.98433 508.06287 
16.12222 13 | 20 TF-WhileOpLSTM 80.80373 396.02133 20.68348 14 | 1 TF-GraphMode 2.55039 12547.11352 1.00000 15 | 4 TF-GraphMode 8.89519 3597.44963 3.48778 16 | 8 TF-GraphMode 24.10036 1327.78104 9.44969 17 | 12 TF-GraphMode 30.35351 1054.24393 11.90153 18 | 16 TF-GraphMode 43.90387 728.86518 17.21459 19 | 20 TF-GraphMode 59.90020 534.22195 23.48671 20 | 1 TF-AutoGraph 2.50235 12787.99954 1.00000 21 | 4 TF-AutoGraph 6.49833 4924.34298 2.59689 22 | 8 TF-AutoGraph 12.64750 2530.14397 5.05426 23 | 12 TF-AutoGraph 18.24804 1753.61285 7.29237 24 | 16 TF-AutoGraph 55.36173 578.01660 22.12393 25 | 20 TF-AutoGraph 53.05073 603.19619 21.20040 26 | 1 TVM-Ansor 1.1106 28813.254097 1.000000 27 | 4 TVM-Ansor 3.7581 8514.941061 3.383847 28 | 8 TVM-Ansor 7.4149 4315.634735 6.676481 29 | 12 TVM-Ansor 11.1161 2878.707460 10.009094 30 | 16 TVM-Ansor 14.8240 2158.661630 13.347740 31 | 20 TVM-Ansor 18.4955 1730.150577 16.653611 32 | 1 CuDNN 0.37390 85583.48580 1.00000 33 | 4 CuDNN 1.07919 29651.76547 2.88629 34 | 8 CuDNN 2.02688 15787.78251 5.42087 35 | 12 CuDNN 2.97586 10753.18958 7.95889 36 | 16 CuDNN 4.06898 7864.38144 10.88242 37 | 20 CuDNN 4.85620 6589.51845 12.98782 38 | -------------------------------------------------------------------------------- /kaleido/frontend/tests/test_type_equivalence.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | from context import * 9 | 10 | from kaleido.frontend.types import Bool, Int, Real 11 | 12 | device = 'cpu' 13 | 14 | 15 | class Test1(unittest.TestCase): 16 | 17 | def test_basic_type(self): 18 | self.assertFalse(Real(64).is_equal_type(Int(32))) 19 | self.assertFalse(Real(64).is_equal_type(Real(32))) 20 | self.assertFalse(Bool().is_equal_type(Real(16))) 21 | 22 | self.assertTrue(Real(64).is_equal_type(Real(64))) 23 | self.assertTrue(Int(16).is_equal_type(Int(16))) 24 | 25 | def test_tensor_type(self): 26 | x = TensorStorage((2, 64), kaleido.float32, device=device) 27 | 28 | self.assertTrue(x.is_equal_type(x)) 29 | self.assertFalse( 30 | x.is_equal_type(TensorStorage((1, 3), kaleido.float32, device))) 31 | 32 | y = FractalTensorStorage( 33 | TensorStorage((2, 64), kaleido.float32, device=device)) 34 | self.assertTrue(y.is_equal_type(y)) 35 | self.assertTrue( 36 | y.is_equal_type( 37 | FractalTensorStorage( 38 | TensorStorage((2, 64), kaleido.float32, device=device)))) 39 | 40 | self.assertFalse( 41 | y.is_equal_type( 42 | FractalTensorStorage( 43 | TensorStorage((4, 7), kaleido.float32, device=device)))) 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /kaleido/parser/tests/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import ast 7 | import inspect 8 | import textwrap 9 | 10 | import astpretty 11 | import asttokens 12 | import torch 13 | 14 | import kaleido 15 | from kaleido import FractalTensor, Tensor 16 | from kaleido.frontend.types import FractalTensorStorage, TensorStorage 17 | 18 | __all__ = [ 19 | 'get_ast', 20 | 'print_ast', 21 | 'create_fractaltensor', 22 | 'create_depth2_fractaltensor', 23 | ] 24 | 25 | 26 | def get_ast(func): 27 | source = inspect.getsource(func) 28 | source = textwrap.dedent(source) 29 | col_offset = len(source.split("\n")[0]) - len(source.split("\n")[0]) 30 | 31 | _, file_lineno = inspect.getsourcelines(func) 32 | 33 | return ast.increment_lineno( 34 | asttokens.ASTTokens(source, parse=True).tree, file_lineno) 35 | 36 | 37 | def print_ast(func): 38 | astpretty.pprint(get_ast(func)) 39 | 40 | 41 | def create_fractaltensor(size, length): 42 | xs = FractalTensor(TensorStorage(size, kaleido.float32, device='cpu')) 43 | xs.indices = list(range(length)) 44 | xs.initialize(torch.rand, *xs.flatten_shape) 45 | return xs 46 | 47 | 48 | def create_depth2_fractaltensor(size, length1, length2): 49 | xss = FractalTensor( 50 | FractalTensorStorage(TensorStorage(size, kaleido.float32, 51 | device='cpu'))) 52 | xss.indices = [list(range(length1)) for _ in range(length2)] 53 | xss.initialize(torch.rand, *xss.flatten_shape) 54 | return xss 55 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tensor/data_movements.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | from typing import Tuple 9 | 10 | import torch 11 | 12 | import kaleido 13 | from kaleido import Tensor 14 | from kaleido.frontend.operations.base import Access 15 | 16 | __all__ = [ 17 | 'cat', 18 | 'permute', 19 | 'stack', 20 | ] 21 | 22 | 23 | class Cat(Access): 24 | 25 | def __call__(self, xs: Tuple, dim: int = 0): 26 | 27 | assert (len(xs)) 28 | 29 | v = torch.cat([x.data for x in xs], dim=dim) 30 | t = Tensor(v.shape, xs[0]._type._dtype, device=xs[0].device) 31 | t.data = v 32 | return t 33 | 34 | 35 | cat = Cat() 36 | 37 | 38 | class Permute(Access): 39 | 40 | def __call__(self, x: Tensor, axes: Tuple[int]) -> Tensor: 41 | assert len(axes) == x.ndim 42 | shape = [x.shape[i] for i in axes] 43 | t = kaleido.Tensor(shape, x._type._dtype, device=x.device) 44 | 45 | t.data = x.data.permute(*axes) 46 | t._type._shape = list(t.data.shape) 47 | t.recompute_strides() 48 | return t 49 | 50 | 51 | permute = Permute() 52 | 53 | 54 | class Stack(Access): 55 | 56 | def __call__(self, xs: Tuple, dim: int = 0): 57 | 58 | assert (len(xs)) 59 | 60 | v = torch.stack([x.data for x in xs], dim=dim) 61 | t = Tensor(v.shape, xs[0]._type._dtype, device=xs[0].device) 62 | t.data = v 63 | return t 64 | 65 | 66 | stack = Stack() 67 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/images/preprocess.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import sys 7 | 8 | test_num = int(sys.argv[1]) 9 | print(f'test number : {test_num}') 10 | 11 | data1 = open(f'stacked_lstm_unfused_elem_fused_bmm_{test_num}.csv', 12 | 'r').read().rstrip().split('\n') 13 | data2 = open(f'stacked_lstm_fused_elem_fused_bmm_{test_num}.csv', 14 | 'r').read().rstrip().split('\n') 15 | 16 | length = len(data1) 17 | header = data1[0] 18 | 19 | with open(f'stacked_lstm{test_num}.csv', 'w') as f: 20 | f.write('%s\n' % (header)) 21 | for i in range(1, length, 2): 22 | unfused_elem_fused_bmm = data1[i] 23 | unfused_elem_fused_bmm = unfused_elem_fused_bmm.replace( 24 | 'FractalTensor', 'FT_unfused-elem_fused-bmm') 25 | cudnn1 = float(data1[i + 1].split('|')[-2]) 26 | 27 | # depth = int(data1[i].split('|')[2].replace('[', '').replace( 28 | # ']', '').split(',')[-1]) 29 | # if i >= 3 and depth % 2: continue 30 | 31 | fused_elem_fused_bmm = data2[i] 32 | fused_elem_fused_bmm = fused_elem_fused_bmm.replace( 33 | 'FractalTensor', 'FT_fused-elem_fused-bmm') 34 | cudnn2 = float(data2[i + 1].split('|')[-2]) 35 | 36 | cudnn = (cudnn1 + cudnn2) / 2. 37 | cudnn_str = data1[i + 1].split('|') 38 | cudnn_str = '|'.join(cudnn_str[0:-2]) + '|%.3f' % (cudnn) + '|' 39 | 40 | f.write('%s\n' % (unfused_elem_fused_bmm)) 41 | f.write('%s\n' % (fused_elem_fused_bmm)) 42 | f.write('%s\n' % (cudnn_str)) 43 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tensor/arithmetic/broadcast.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | from typing import Tuple 9 | 10 | import kaleido 11 | from kaleido.frontend.operations.base import Broadcast 12 | 13 | __all__ = [ 14 | 'scale', 15 | '_broadcast_div', 16 | '_broadcast_pow', 17 | ] 18 | 19 | 20 | class Scale(Broadcast): 21 | """y = x * y where x is a scalar 22 | 23 | x is the smaller tensor while y is the larger tensor. 24 | """ 25 | 26 | def __call__(self, x: kaleido.Tensor, y: kaleido.Tensor) -> kaleido.Tensor: 27 | t = super(Scale, self).__call__(x, y) 28 | 29 | t.data = x.data * y.data 30 | t._type._shape = t.data.shape 31 | t.recompute_strides() 32 | return t 33 | 34 | 35 | scale = Scale() 36 | 37 | 38 | class _BroadcastDiv(Broadcast): 39 | 40 | def __call__(self, x: kaleido.Tensor, y: kaleido.Tensor) -> kaleido.Tensor: 41 | t = super(_BroadcastDiv, self).__call__(x, y) 42 | 43 | t.data = x.data / y.data 44 | t._type._shape = t.data.shape 45 | t.recompute_strides() 46 | return t 47 | 48 | 49 | _broadcast_div = _BroadcastDiv() 50 | 51 | 52 | class _BroadcastPow(Broadcast): 53 | 54 | def __call__(self, x: kaleido.Tensor, y: kaleido.Tensor) -> kaleido.Tensor: 55 | t = super(_BroadcastPow, self).__call__(x, y) 56 | 57 | t.data = x.data**y.data 58 | t._type._shape = t.data.shape 59 | t.recompute_strides() 60 | return t 61 | 62 | 63 | _broadcast_pow = _BroadcastPow() 64 | -------------------------------------------------------------------------------- /examples/rnn_attention/rnn_attention_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import torch 7 | 8 | import kaleido 9 | from examples.utils import gen_dataset 10 | from kaleido import Tensor 11 | 12 | # ============= hyper parameters 13 | device = 'cpu' 14 | # device = 'cuda' 15 | 16 | batch_size = 7 17 | vocab_size = 5000 18 | hidden_dim = 512 19 | 20 | src_emb = Tensor((vocab_size, hidden_dim), kaleido.float32, device=device) 21 | src_emb.initialize(torch.rand, *src_emb.shape, device=device) 22 | 23 | trg_emb = Tensor((vocab_size, hidden_dim), kaleido.float32, device=device) 24 | trg_emb.initialize(torch.rand, *trg_emb.shape, device=device) 25 | 26 | 27 | def create_cell_param(): 28 | W = Tensor((hidden_dim, hidden_dim), kaleido.float32, device=device) 29 | W.initialize(torch.rand, *W.shape, device=device) 30 | 31 | U = Tensor((hidden_dim, hidden_dim), kaleido.float32, device=device) 32 | U.initialize(torch.rand, *U.shape, device=device) 33 | 34 | b = Tensor((1, hidden_dim), kaleido.float32, device=device) 35 | b.initialize(torch.rand, *b.shape, device=device) 36 | return {'W': W, 'U': U, 'b': b} 37 | 38 | 39 | src_params = create_cell_param() 40 | trg_params = create_cell_param() 41 | 42 | encoder_proj = Tensor((hidden_dim, 1), kaleido.float32, device=device) 43 | encoder_proj.initialize(torch.rand, *encoder_proj.shape, device=device) 44 | 45 | decoder_proj = Tensor((hidden_dim, 1), kaleido.float32, device=device) 46 | decoder_proj.initialize(torch.rand, *decoder_proj.shape, device=device) 47 | 48 | attn_params = (encoder_proj, decoder_proj) 49 | 50 | src_words = gen_dataset(batch_size, vocab_size) 51 | trg_words = gen_dataset(batch_size, vocab_size) 52 | -------------------------------------------------------------------------------- /examples/dilated_rnn/dilated_rnn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) 
Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from typing import Tuple 7 | 8 | import context 9 | 10 | from examples.stacked_rnn.rnn_utils import * 11 | from examples.stacked_rnn.stacked_rnn import lstm_cell 12 | from examples.utils import gen_dataset 13 | 14 | 15 | # @kaleido.function(ctx) 16 | def dilated_layer(state: FractalTensor[Tensor['1, 512', float, 'cpu']], 17 | itr: int, 18 | Ws: FractalTensor[Tensor['512, 521', float, 'cpu']], 19 | Us: FractalTensor[Tensor['512, 512', float, 'cpu']], 20 | bs: FractalTensor[Tensor['1, 512', float, 'cpu']] 21 | ) -> FractalTensor[Tensor['1, 512', float, 'cpu']]: 22 | zeros = ops.zeros(shape=(1, 512), device='cpu', dtype='float') 23 | h, _ = ops.dilated_map( 24 | lambda xs: ops.scan(lambda s, x: lstm_cell(*s, x, Ws, Us, bs), 25 | xs, initializer=(zeros, zeros)), 26 | state, 27 | dilation=2**itr) 28 | return h 29 | 30 | 31 | # @kaleido.function(ctx) 32 | def model(batch_words: FractalTensor[FractalTensor[Tensor['1,', int, 'cpu']]], 33 | params: ModelParams 34 | ) -> FractalTensor[FractalTensor[Tensor['1, 512', float, 'cpu']]]: 35 | embs = ops.map(lambda words: ops.map(lambda word: 36 | ops.index(ops.slices(params.embedding, dim=0), word), words), 37 | batch_words) 38 | itrs = ops.enumerate(params.Wss, params.Uss, params.bss) 39 | rnn_outs = ops.map(lambda xs: ops.fold(lambda s, x: dilated_layer(s, *x), 40 | itrs, initializer=xs), embs) 41 | return rnn_outs 42 | 43 | 44 | if __name__ == '__main__': 45 | xss = gen_dataset(batch_size, vocab_size) 46 | yss = model(xss, params) 47 | -------------------------------------------------------------------------------- /kaleido/core/tensor_shape.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 
| namespace kaleido { 14 | namespace core { 15 | 16 | // The tensor shape is static after declaration. 17 | class TensorShape { 18 | public: 19 | explicit TensorShape(std::initializer_list sizes) 20 | : dim_sizes_(sizes), 21 | dim_(sizes.size()), 22 | numel_(std::accumulate(sizes.begin(), sizes.end(), 1, 23 | std::multiplies())) {} 24 | explicit TensorShape(const std::vector sizes) 25 | : dim_sizes_(std::move(sizes)), 26 | dim_(sizes.size()), 27 | numel_(std::accumulate(sizes.begin(), sizes.end(), 1, 28 | std::multiplies())) {} 29 | ~TensorShape() = default; 30 | bool IsEuqalShape(const TensorShape& b) const; 31 | void operator=(TensorShape& b) { 32 | dim_sizes_ = std::move(b.dims()); 33 | dim_ = b.ndim(); 34 | numel_ = b.numel(); 35 | }; 36 | 37 | bool operator==(const TensorShape& b) const { return IsEuqalShape(b); }; 38 | bool operator!=(const TensorShape& b) const { return !IsEuqalShape(b); }; 39 | 40 | std::string DebugString() const; 41 | 42 | size_t ndim() const { return dim_; } 43 | int64_t dim_size(int i) const { 44 | return i >= 0 ? dim_sizes_[i] : dim_sizes_[dim_ + i]; 45 | } 46 | 47 | const std::vector& dims() const { return dim_sizes_; } 48 | int64_t numel() const { return numel_; } 49 | 50 | int64_t count(int i) const { 51 | return std::accumulate(dim_sizes_.begin() + i, dim_sizes_.end(), 1, 52 | std::multiplies()); 53 | } 54 | 55 | std::vector dim_sizes_; 56 | size_t dim_; 57 | int64_t numel_; 58 | }; 59 | 60 | } // namespace core 61 | } // namespace kaleido 62 | -------------------------------------------------------------------------------- /kaleido/core/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | set(TARGET fractaltensor_core) 7 | 8 | include_directories("${PROJECT_SOURCE_DIR}") 9 | include_directories(${CUDA_INCLUDE_DIRS}) 10 | include_directories(${CUDNN_INCLUDE_DIRS}) 11 | include_directories(${Boost_INCLUDE_DIRS}) 12 | 13 | # set(PATH_PREFIX ${PROJECT_SOURCE_DIR}/kaleido/core) file(GLOB_RECURSE 14 | # PROTOBUF_FILE "${PATH_PREFIX}/*.proto") get_filename_component(PROTO_PATH 15 | # ${PROTOBUF_FILE} ABSOLUTE) get_filename_component(PROTO_NAME ${PROTOBUF_FILE} 16 | # NAME_WE) 17 | 18 | # cpp_proto_generate("${TARGET}_proto" SRCS "${PROTOBUF_FILE}") 19 | # add_custom_command( TARGET "${TARGET}_proto" POST_BUILD COMMAND cp *.pb.* 20 | # ${PATH_PREFIX} COMMENT "Copy generated C++ proto into directory kaleido/core." 21 | # WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) 22 | # 23 | # py_proto_generate("${TARGET}_proto_py" SRCS "${PROTOBUF_FILE}") 24 | # add_custom_command( TARGET "${TARGET}_proto_py" POST_BUILD COMMAND cp *.py 25 | # "${PROJECT_SOURCE_DIR}/kaleido/frontend" COMMENT "Copy generated python proto 26 | # into directory kaleido/frontend." 
WORKING_DIRECTORY 27 | # ${CMAKE_CURRENT_BINARY_DIR}) 28 | 29 | add_subdirectory(operators) 30 | 31 | file( 32 | GLOB KALEIDO_CORE_SRCS 33 | RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" 34 | "*.cc" "device/*.cc") 35 | 36 | cc_library( 37 | ${TARGET} 38 | SHARED 39 | IMPORTED 40 | SRCS 41 | ${KALEIDO_CORE_SRCS} 42 | DEPS 43 | python 44 | # ${TARGET}_proto protobuf 45 | ) 46 | 47 | target_link_libraries(${TARGET} Boost::filesystem Boost::regex) 48 | target_link_libraries(${TARGET} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} 49 | ${CUDNN_LIBRARIES}) 50 | 51 | add_subdirectory(tests) 52 | add_subdirectory(device/tests) 53 | -------------------------------------------------------------------------------- /kaleido/core/operators/fill_op.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/device/cuda_utils.h" 5 | #include "kaleido/core/device/gpu_context.h" 6 | #include "kaleido/core/device/kernels/fill.h" 7 | #include "kaleido/core/operators/fill_op.h" 8 | 9 | #include 10 | 11 | namespace kaleido { 12 | namespace core { 13 | namespace ops { 14 | 15 | template 16 | class FillOp { 17 | public: 18 | void operator()(Tensor& input, float value) { 19 | int numel = static_cast(input.numel()); 20 | T* data = input.mutable_data(); 21 | 22 | int threads = 128; 23 | int blocks = DIVUP(numel, threads); 24 | cuda_kernel::KeFillValue<<>>(data, numel, value); 25 | } 26 | 27 | void operator()(Tensor& input) { 28 | T* data = input.mutable_data(); 29 | int num = static_cast(input.numel()); 30 | cuda_kernel::FillRandomValue(data, num); 31 | } 32 | 33 | void operator()(Tensor& input, float mean = 0, float stddev = 0.1) { 34 | T* data = input.mutable_data(); 35 | int num = static_cast(input.numel()); 36 | cuda_kernel::FillRandomValue(data, num, mean, stddev); 37 | } 38 | 39 | void operator()(Tensor& input, const std::string& mode, 
float scale = 1.) { 40 | if (mode == "seq") { 41 | T* data = input.mutable_data(); 42 | int64_t numel = input.numel(); 43 | 44 | int threads = 128; 45 | int blocks = DIVUP(numel, threads); 46 | cuda_kernel::KeFillSequential<<>>(data, numel, scale); 47 | } else { 48 | LOG(FATAL) << "Unknown mode: " << mode << std::endl; 49 | } 50 | } 51 | }; 52 | 53 | template class FillOp; 54 | template class FillOp; 55 | template class FillOp; 56 | 57 | } // namespace ops 58 | } // namespace core 59 | } // namespace kaleido 60 | -------------------------------------------------------------------------------- /docs/fractaltensor_operations/memory_operations.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [Memory operations](#memory-operations) 4 | - [\*copy](#copy) 5 | - [repeat](#repeat) 6 | - [stack](#stack) 7 | - [flatten](#flatten) 8 | - [split (partition)](#split-partition) 9 | 10 | 14 | 15 | 16 | # Memory operations 17 | 18 | ## \*copy 19 | 20 | ```python 21 | copy(x: FractalTensor[T]) -> FractalTensor[T] 22 | ``` 23 | 24 | ## repeat 25 | 26 | $$\mathbf{repeat} ::\Psi n.[\alpha]^d_n \rightarrow \text{int}\rightarrow \Psi m.[\alpha]^{d}_m$$ 27 | 28 | ```python 29 | repeat(x: FractalTensor[T], repeats: int) -> FractalTensor[T] 30 | ``` 31 | 32 | Examples: 33 | 34 | ``` 35 | x: FractalTensor = [t1, t2, t3] 36 | 37 | y: FractalTensor = repeats(x, 3) 38 | 39 | y = [t1, t2, t3, t1, t2, t3] 40 | ``` 41 | 42 | ## stack 43 | 44 | $$\mathbf{stack} ::\Psi n.[\alpha]^1_n \rightarrow \text{int}\rightarrow \beta$$ 45 | 46 | ```python 47 | stack(x: FractalTensor[Tensor], axis: int) -> Tensor 48 | ``` 49 | 50 | `stack` is **ONLY** defined for a depth-1 `FractalTensor` . 51 | 52 | Example, suppose `x = FractalTensor[Tensor[3, 7], float32]` 53 | 54 |

55 | 56 |

57 | 58 | ## flatten 59 | 60 | $$\mathbf{flatten} ::\Psi n.[\alpha]^d_n \rightarrow \text{int}\rightarrow \beta$$ 61 | 62 | ```python 63 | flatten(x: FractalTensor[T], axis: int) -> Tensor 64 | ``` 65 | 66 | `flatten` is equal to retrieve all tensors contained in `x` by positive lexicographic order, put them into a depth-1 `FractalTensor` , and then call `stack` on this depth-1 `FractalTensor` . 67 | 68 | ## split (partition) 69 | 70 | $$\mathbf{split} ::\Psi n.[\alpha]^d_n \rightarrow \text{int}\rightarrow (\Psi m.[\beta]^d_m)$$ 71 | 72 | Partition a `FractalTensor` into a tuple of `FractalTensor` . 73 | 74 | ```python 75 | split(x: FractalTensor[T], n: int, pad_value: T = None) -> Tuple[FractalTensor[T]] 76 | ``` 77 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/test_flatten.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | import torch 9 | from context import * 10 | 11 | from kaleido import FractalTensor 12 | 13 | 14 | class TestFlatten(unittest.TestCase): 15 | 16 | def create_data(self): 17 | shape = [3, 4] 18 | dtype = kaleido.TensorStorage(shape, 19 | kaleido.float32, 20 | device='cpu', 21 | order='row') 22 | 23 | count = 0 24 | 25 | xs = [] 26 | x_indices = [] 27 | for i in range(5): 28 | xs.append(FractalTensor(dtype)) 29 | n = random.randint(13, 27) 30 | count += n 31 | x_indices.append(list(range(n))) 32 | 33 | x = FractalTensor.from_fractaltensors(*xs) 34 | x.indices = x_indices 35 | 36 | x.initialize(torch.rand, *x.flatten_shape) 37 | return x, count 38 | 39 | def test1(self): 40 | x, count = self.create_data() 41 | dim = 0 42 | y = kaleido.operations.flatten(x, dim) 43 | self.assertTrue(isinstance(y, kaleido.Tensor)) 44 | 45 | new_shape = x.element_type.shape 46 | new_shape[dim] = new_shape[dim] * count 47 | for s1, s2 in zip(new_shape, y.shape): 48 | self.assertEqual(s1, s2) 49 | 50 | def test2(self): 51 | x, count = self.create_data() 52 | dim = 1 53 | y = kaleido.operations.flatten(x, dim) 54 | self.assertTrue(isinstance(y, kaleido.Tensor)) 55 | 56 | new_shape = x.element_type.shape 57 | new_shape[dim] = new_shape[dim] * count 58 | for s1, s2 in zip(new_shape, y.shape): 59 | self.assertEqual(s1, s2) 60 | 61 | 62 | if __name__ == '__main__': 63 | unittest.main() 64 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tensor/reshape.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | __all__ = [ 9 | 'reshape', 10 | 'squeeze', 11 | 'unsqueeze', 12 | ] 13 | 14 | from typing import List, Union 15 | 16 | import torch 17 | 18 | import kaleido 19 | from kaleido.frontend.operations.base import BaseOp 20 | 21 | 22 | class Reshape(BaseOp): 23 | 24 | def __call__(self, x: kaleido.Tensor, shape: Union[List[int], int]): 25 | if not isinstance(shape, List): 26 | if isinstance(shape, int): 27 | shape = [shape] 28 | else: 29 | raise ValueError('shape should be list of integers.') 30 | 31 | super(Reshape, self).__call__(x) 32 | 33 | t = kaleido.Tensor(shape, x._type._dtype, device=x.device) 34 | t.data = torch.reshape(x.data, shape) 35 | t._type._shape = list(t.data.shape) 36 | t.recompute_strides() 37 | return t 38 | 39 | 40 | reshape = Reshape() 41 | 42 | 43 | class Squeeze(BaseOp): 44 | 45 | def __call__(self, x: kaleido.Tensor, dim: int = None): 46 | super(Squeeze, self).__call__(x) 47 | 48 | t = kaleido.Tensor([0], x._type._dtype, device=x.device) 49 | t.data = x.data.squeeze(dim) if dim else x.data.squeeze() 50 | t._type._shape = list(t.data.shape) 51 | t.recompute_strides() 52 | return t 53 | 54 | 55 | squeeze = Squeeze() 56 | 57 | 58 | class Unsqueeze(BaseOp): 59 | 60 | def __call__(self, x: kaleido.Tensor, dim: int): 61 | super(Unsqueeze, self).__call__(x) 62 | 63 | t = kaleido.Tensor([0], x._type._dtype, device=x.device) 64 | t.data = x.data.unsqueeze(dim) 65 | t._type._shape = list(t.data.shape) 66 | t.recompute_strides() 67 | return t 68 | 69 | 70 | unsqueeze = Unsqueeze() 71 | -------------------------------------------------------------------------------- /kaleido/core/tests/test_allocator.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "kaleido/core/cuda_allocator.h" 5 | #include "kaleido/core/device/cuda_utils.h" 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace kaleido { 14 | namespace core { 15 | 16 | TEST(test1, TEST_MEMORY_POOL) { 17 | cudaStream_t stream; 18 | CudaCheck(cudaStreamCreate(&stream)); 19 | 20 | std::shared_ptr memoryPool = 21 | std::make_shared(); 22 | // CudaMemoryPool is NOT multi-thread safe, 23 | // prevent multiple threads from modifying the memory pool at the same time. 24 | std::mutex mtx; 25 | 26 | // Before allocating memory from the memory pool, 27 | // you need to register **all** the streams that may use memory space 28 | // allocated from the memory pool. 29 | const std::lock_guard lock(mtx); 30 | memoryPool->add_track_stream(stream); 31 | 32 | // Get 256MB cuda memory block. 33 | // Only when the memory pool does not have a memory block that meets the 34 | // requirements, a new memory block is actually allocated from the physical 35 | // device. Requirements: 36 | // - The returned memory block size should be greater than the required 37 | // size. 38 | // - The returned memory block size should be less than twice the requested 39 | // size. 40 | 41 | // The returned memory space is guaranteed to meet the requested size. 42 | // If the user reads and writes beyond the requested size, undefined 43 | // behavior may occur. 44 | int nbytes = 256 * 1024 * 2014; 45 | void* ret = memoryPool->Allocate(nbytes); 46 | 47 | // Put the memory space back into the memory pool. 
48 | memoryPool->Deallocate(ret); 49 | 50 | nbytes = 128 * 1024 * 2014; 51 | ret = memoryPool->Allocate(nbytes); 52 | memoryPool->Deallocate(ret); 53 | 54 | nbytes = 128 * 1024 * 2014; 55 | ret = memoryPool->Allocate(nbytes); 56 | memoryPool->Deallocate(ret); 57 | } 58 | 59 | } // namespace core 60 | } // namespace kaleido 61 | -------------------------------------------------------------------------------- /benchmarks/rnn/cuDNN/utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "RNN_example.h" 5 | 6 | template 7 | float runRNNSample(RNNSampleOptions& options) { 8 | RNNSample sample; 9 | sample.setup(options); 10 | sample.run(); 11 | return sample.timeForward; 12 | } 13 | 14 | float TestCuDNNLSTM(int mini_batch, int hidden_size, int seq_length, 15 | int num_layers, int input_size) { 16 | RNNSampleOptions options; 17 | 18 | options.dataType = 1; // CUDNN_DATA_FLOAT 19 | // options.dataType = 0; 20 | options.seqLength = seq_length; 21 | options.numLayers = num_layers; 22 | options.inputSize = input_size; 23 | options.hiddenSize = hidden_size; 24 | options.projSize = hidden_size; 25 | options.miniBatch = mini_batch; 26 | options.inputMode = 1; // CUDNN_LINEAR_INPUT 27 | options.dirMode = 0; // CUDNN_UNIDIRECTIONAL 28 | options.cellMode = 2; // CUDNN_LSTM 29 | options.biasMode = 3; // CUDNN_RNN_DOUBLE_BIAS 30 | options.algorithm = 0; // CUDNN_RNN_ALGO_STANDARD 31 | options.mathPrecision = 1; // CUDNN_DATA_FLOAT 32 | // options.mathPrecision = 0; 33 | options.mathType = 0; // CUDNN_DEFAULT_MATH 34 | // options.mathType = 1; // CUDNN_TENSOR_OP_MATH 35 | options.dropout = 0.; 36 | options.printWeights = 0; 37 | 38 | return runRNNSample(options); 39 | // return runRNNSample<__half>(options); 40 | } 41 | 42 | int getRand(int min, int max) { return (rand() % (max - min)) + min + 1; } 43 | 44 | void genSeqs(int 
batch_size, int seq_length, bool random) { 45 | std::vector temp(batch_size, seq_length); 46 | 47 | std::default_random_engine e; 48 | e.seed(1234); 49 | std::normal_distribution distribution(seq_length / 2, seq_length / 8); 50 | 51 | for (int i = 1; i < batch_size; ++i) { 52 | if (random) { 53 | temp[i] = (int)distribution(e); 54 | } else { 55 | temp[i] = seq_length; 56 | } 57 | } 58 | sort(temp.begin(), temp.end()); 59 | reverse(temp.begin(), temp.end()); 60 | seqs = temp; 61 | } 62 | -------------------------------------------------------------------------------- /kaleido/core/operators/gather_nd_op.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/device/gpu_context.h" 5 | #include "kaleido/core/device/kernels/gather_scatter.h" 6 | #include "kaleido/core/operators/gather_nd_op.h" 7 | 8 | namespace kaleido { 9 | namespace core { 10 | namespace ops { 11 | 12 | template 13 | class GatherNdOp { 14 | public: 15 | void operator()(const GPUContext& context, Tensor& output, 16 | const Tensor& input, const Tensor& indices) { 17 | auto index_dims = indices.dims(); 18 | size_t index_dims_size = indices.ndim(); 19 | auto input_dims = input.dims(); 20 | size_t input_dims_size = input.ndim(); 21 | 22 | // indices for the first `end_size` dimensionalities are specified 23 | int64_t end_size = index_dims[index_dims_size - 1]; 24 | 25 | int64_t remain_numel = 1; 26 | for (int i = 0; i < index_dims_size - 1; ++i) remain_numel *= index_dims[i]; 27 | 28 | // slice size 29 | int64_t slice_size = 1; 30 | for (int64_t i = end_size; i < input_dims_size; ++i) { 31 | // innermost dimensionalities form contiguous memory to slice. 
32 | slice_size *= input_dims[i]; 33 | } 34 | 35 | int64_t* g_input_dims; 36 | CudaCheck(cudaMalloc(&g_input_dims, input_dims_size * sizeof(int64_t))); 37 | CudaCheck(cudaMemcpy(g_input_dims, input_dims.data(), 38 | input_dims_size * sizeof(int64_t), 39 | cudaMemcpyHostToDevice)); 40 | 41 | int64_t block = 512; 42 | int64_t n = slice_size * remain_numel; 43 | int64_t grid = (n + block - 1) / block; 44 | 45 | cuda_kernel::GatherNdCUDAKernel<<>>( 46 | input.data(), g_input_dims, indices.data(), 47 | output.mutable_data(), remain_numel, slice_size, end_size); 48 | 49 | cudaFree(g_input_dims); 50 | } 51 | }; 52 | 53 | template class GatherNdOp; 54 | template class GatherNdOp; 55 | 56 | } // namespace ops 57 | } // namespace core 58 | } // namespace kaleido 59 | -------------------------------------------------------------------------------- /benchmarks/rnn/cuDNN/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | project(benchmarks CXX C) 3 | 4 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 5 | "${CMAKE_SOURCE_DIR}/../../../cmake/Modules/") 6 | 7 | message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " 8 | "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") 9 | message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " 10 | "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") 11 | 12 | find_package(CUDA QUIET REQUIRED) 13 | find_package(CuDNN QUIET REQUIRED) 14 | 15 | set(CMAKE_BUILD_TYPE Release) 16 | 17 | set(CMAKE_CXX_STANDARD 14) 18 | set(CMAKE_CXX_STANDARD_REQUIRED TRUE) 19 | set(CMAKE_CUDA_STANDARD 14) 20 | set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") 23 | set(CMAKE_CXX_FLAGS_DEBUG 24 | "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") 25 | set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall -Wno-sign-compare") 26 | 27 | set(CMAKE_CXX_LINK_EXECUTABLE 28 | 
"${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") 29 | 30 | set(CUDA_PROPAGATE_HOST_FLAGS OFF) 31 | 32 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -w -gencode arch=compute_75,code=sm_75) 33 | set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} -w -gencode 34 | arch=compute_75,code=sm_75) 35 | set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 -gencode 36 | arch=compute_75,code=sm_75) 37 | 38 | include_directories(${CUDA_INCLUDE_DIRS}) 39 | include_directories(${CUDNN_INCLUDE_DIRS}) 40 | 41 | cuda_add_executable(cudnn_lstm main.cu) 42 | target_link_libraries(cudnn_lstm ${CUDA_LIBRARIES} ${CUDA_curand_LIBRARY} 43 | ${CUDNN_LIBRARIES}) 44 | 45 | cuda_add_executable(lstm_cell_cudnn lstm_cell_cudnn.cu) 46 | target_link_libraries(lstm_cell_cudnn ${CUDA_LIBRARIES} ${CUDA_curand_LIBRARY} 47 | ${CUDNN_LIBRARIES}) 48 | 49 | cuda_add_executable(stacked_lstm_cudnn stacked_lstm_cudnn.cu) 50 | target_link_libraries(stacked_lstm_cudnn ${CUDA_LIBRARIES} 51 | ${CUDA_curand_LIBRARY} ${CUDNN_LIBRARIES}) 52 | -------------------------------------------------------------------------------- /cmake/external/zlib.cmake: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | include(ExternalProject) 7 | 8 | set(ZLIB_PREFIX_DIR ${THIRD_PARTY_PATH}/zlib) 9 | set(ZLIB_SOURCE_DIR ${THIRD_PARTY_PATH}/zlib/src/extern_zlib) 10 | set(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) 11 | set(ZLIB_ROOT 12 | ${ZLIB_INSTALL_DIR} 13 | CACHE FILEPATH "zlib root directory." FORCE) 14 | set(ZLIB_INCLUDE_DIR 15 | "${ZLIB_INSTALL_DIR}/include" 16 | CACHE PATH "zlib include directory." 
FORCE) 17 | set(ZLIB_REPOSITORY https://github.com/madler/zlib.git) 18 | set(ZLIB_TAG v1.2.8) 19 | 20 | include_directories(${ZLIB_INCLUDE_DIR}) 21 | include_directories(${THIRD_PARTY_PATH}/install) 22 | 23 | cache_third_party( 24 | extern_zlib 25 | REPOSITORY 26 | ${ZLIB_REPOSITORY} 27 | TAG 28 | ${ZLIB_TAG} 29 | DIR 30 | ZLIB_SOURCE_DIR) 31 | 32 | ExternalProject_Add( 33 | extern_zlib 34 | ${EXTERNAL_PROJECT_LOG_ARGS} 35 | ${SHALLOW_CLONE} 36 | "${ZLIB_DOWNLOAD_CMD}" 37 | PREFIX ${ZLIB_PREFIX_DIR} 38 | SOURCE_DIR ${ZLIB_SOURCE_DIR} 39 | UPDATE_COMMAND "" 40 | CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} 41 | -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} 42 | -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} 43 | -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} 44 | -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} 45 | -DBUILD_SHARED_LIBS=OFF 46 | -DCMAKE_POSITION_INDEPENDENT_CODE=ON 47 | -DCMAKE_MACOSX_RPATH=ON 48 | -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} 49 | ${EXTERNAL_OPTIONAL_ARGS} 50 | CMAKE_CACHE_ARGS 51 | -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} 52 | -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON 53 | -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}) 54 | set(ZLIB_LIBRARIES 55 | "${ZLIB_INSTALL_DIR}/lib/libz.a" 56 | CACHE FILEPATH "zlib library." FORCE) 57 | 58 | add_library(zlib STATIC IMPORTED GLOBAL) 59 | set_property(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) 60 | add_dependencies(zlib extern_zlib) 61 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/test_aggregate.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | import itertools 9 | import operator 10 | 11 | import torch 12 | from context import * 13 | 14 | 15 | class TestScan(unittest.TestCase): 16 | MAX = 193 17 | N = 17 18 | 19 | def setUp(self): 20 | random.seed(12345) 21 | 22 | self.data = [ 23 | random.randint(0, TestScan.MAX) for _ in range(TestScan.N) 24 | ] 25 | self.xs = kaleido.FractalTensor.from_pylist(self.data) 26 | 27 | def test1(self): 28 | """Test single-level scan.""" 29 | 30 | expected_results = list(itertools.accumulate(self.data, operator.add)) 31 | 32 | ys = kaleido.operations.scan(lambda s, x: kaleido.operations.add(s, x), 33 | self.xs) 34 | self.assertTrue(isinstance(ys, kaleido.FractalTensor)) 35 | self.assertEqual(len(ys), len(self.xs)) 36 | 37 | init = kaleido.Tensor((1, ), kaleido.int32) 38 | init.data = torch.LongTensor([5]) 39 | ys = kaleido.operations.scan(lambda s, x: kaleido.operations.add(s, x), 40 | self.xs, init) 41 | self.assertTrue(isinstance(ys, kaleido.FractalTensor)) 42 | self.assertEqual(len(ys), len(self.xs)) 43 | 44 | expected_results = list( 45 | itertools.accumulate([5] + self.data, operator.add)) 46 | for i, y in enumerate(ys): 47 | self.assertEqual(y.data.item(), expected_results[i + 1]) 48 | 49 | def test2(self): 50 | init = kaleido.Tensor((1, ), kaleido.int32) 51 | init.data = torch.LongTensor([5]) 52 | 53 | ys, zs = kaleido.operations.scan(lambda s, x: (x, x), self.xs, init) 54 | 55 | for x, y, z in kaleido.operations.zip(self.xs, ys, zs): 56 | self.assertEqual(y.data.item(), x.data.item()) 57 | self.assertEqual(y.data.item(), x.data.item()) 58 | 59 | 60 | if __name__ == '__main__': 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/test_join.py: -------------------------------------------------------------------------------- 1 | # 
------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | import torch 9 | from context import * 10 | 11 | 12 | class TestJoin(unittest.TestCase): 13 | 14 | def create_depth1_fractaltensor(self, length, device='cpu'): 15 | shape = [3, 7] 16 | xs = kaleido.FractalTensor( 17 | kaleido.TensorStorage(shape, kaleido.float32, device=device)) 18 | xs.indices = list(range(length)) 19 | xs.initialize(torch.rand, *xs.flatten_shape, device=device) 20 | return xs 21 | 22 | def create_depth2_fractaltensor(self, length, device='cpu'): 23 | shape = [3, 7] 24 | xss = kaleido.FractalTensor( 25 | kaleido.FractalTensorStorage( 26 | kaleido.TensorStorage(shape, kaleido.float32, device=device))) 27 | xss.indices = [ 28 | list(range(random.randint(5, 17))) for _ in range(length) 29 | ] 30 | xss.initialize(torch.rand, *xss.flatten_shape, device=device) 31 | return xss 32 | 33 | def setUp(self): 34 | random.seed(12345) 35 | 36 | def test_join1(self): 37 | xs = self.create_depth1_fractaltensor(19) 38 | ys = self.create_depth1_fractaltensor(3) 39 | zs = kaleido.operations.join(xs, ys) 40 | 41 | self.assertTrue(isinstance(zs, kaleido.FractalTensor)) 42 | self.assertEqual(zs.depth, xs.depth) 43 | self.assertEqual(len(xs) + len(ys), len(zs)) 44 | self.assertEqual(xs.numel + ys.numel, zs.numel) 45 | 46 | def test_join2(self): 47 | xss = self.create_depth2_fractaltensor(11) 48 | yss = self.create_depth2_fractaltensor(7) 49 | zss = kaleido.operations.join(xss, yss) 50 | 51 | self.assertTrue(isinstance(zss, kaleido.FractalTensor)) 52 | self.assertEqual(zss.depth, xss.depth) 53 | self.assertEqual(len(xss) + len(yss), len(zss)) 54 | self.assertEqual(xss.numel + yss.numel, zss.numel) 55 | 56 | 57 | if __name__ == '__main__': 58 | unittest.main() 59 | 
-------------------------------------------------------------------------------- /kaleido/core/device/gpu_context.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/device/cuda_info.h" 4 | #include "kaleido/core/device/cuda_utils.h" 5 | #include "kaleido/core/device/device_context.h" 6 | #include "kaleido/core/place.h" 7 | 8 | namespace kaleido { 9 | namespace core { 10 | 11 | class GPUContext : public DeviceContext { 12 | public: 13 | GPUContext(); 14 | explicit GPUContext(const CUDAPlace& place) : place_{place} { 15 | CublasCheck(cublasCreate(&cublas_handle_)); 16 | CublasCheck(cublasSetPointerMode(cublas_handle_, CUBLAS_POINTER_MODE_HOST)); 17 | CudnnCheck(cudnnCreate(&cudnn_handle_)); 18 | 19 | compute_capability_ = GetGPUComputeCapability(place_.GetDeviceId()); 20 | multi_process_ = GetGPUMultiProcessors(place_.GetDeviceId()); 21 | max_threads_per_mp_ = 22 | GetGPUMaxThreadsPerMultiProcessor(place_.GetDeviceId()); 23 | max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.GetDeviceId()); 24 | max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.GetDeviceId()); 25 | device_name_ = GetDeviceName(); 26 | } 27 | 28 | static GPUContext& GetInstance() { 29 | static GPUContext context; 30 | return context; 31 | } 32 | 33 | ~GPUContext(); 34 | GPUContext(GPUContext const&) = delete; 35 | void operator=(GPUContext const&) = delete; 36 | 37 | int GetComputeCapability() const { return compute_capability_; }; 38 | 39 | int GetMaxPhysicalThreadCount() const { 40 | return multi_process_ * max_threads_per_mp_; 41 | }; 42 | 43 | int GetMaxThreadsPerBlock() const { return max_threads_per_block_; }; 44 | 45 | int GetSMCount() const { return multi_process_; }; 46 | 47 | dim3 GetCUDAMaxGridDimSize() const { return max_grid_dim_size_; }; 48 | std::string GetDeviceName() const { return device_name_; } 49 | 50 | cublasHandle_t cublas_handle() const { return cublas_handle_; } 51 | cudnnHandle_t 
cudnn_handle() const { return cudnn_handle_; } 52 | 53 | private: 54 | cublasHandle_t cublas_handle_; 55 | cudnnHandle_t cudnn_handle_; 56 | 57 | CUDAPlace place_; 58 | int compute_capability_; 59 | int multi_process_; 60 | int max_threads_per_mp_; 61 | int max_threads_per_block_; 62 | dim3 max_grid_dim_size_; 63 | std::string device_name_; 64 | }; 65 | 66 | } // namespace core 67 | } // namespace kaleido 68 | -------------------------------------------------------------------------------- /docs/fractaltensor_operations/extended_access_operations.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [Extended operations to access a single FractalTensor](#extended-operations-to-access-a-single-fractaltensor) 4 | - [head and tail](#head-and-tail) 5 | - [slide](#slide) 6 | - [[**deprecated**] access_by_depth](#deprecated-access_by_depth) 7 | 8 | 12 | 13 | 14 | # Extended operations to access a single FractalTensor 15 | 16 | In the full program, accessing `FractalTensor` is not materialized directly. They encode information of how parallel functions read the inputs. 17 | 18 | Extended access APIs are wrappers of accessing primitives. It is not necessary to enumerate and implement them all. They are implemented through access primitives and are all unified into and analyzed as some form of access functions in the IR program. 19 | 20 | ## head and tail 21 | 22 | $$\mathbf{head}::\Psi n.[\alpha]_n^d \rightarrow [\alpha]_1^{[d-1]}$$ 23 | $$\mathbf{tail}::\Psi n.[\alpha]_n^d \rightarrow [\alpha]_1^{[d-1]}$$ 24 | 25 | ```python 26 | head(x: FractalTensor[T]) -> T 27 | tail(x: FractalTensor[T]) -> T 28 | ``` 29 | 30 | ## slide 31 | 32 | ```python 33 | slide(input: FractalTensor[T], 34 | window_size: int, 35 | stride: int, 36 | dilation: int, 37 | padding: int = None, 38 | padding_value: T = None) -> FractalTensor[FractalTensor[T]]: 39 | ``` 40 | 41 |

42 |
43 | Fig. Apply a sliding window over a FractalTensor variable. 44 |

45 | 46 | 47 | ## [**deprecated**] access_by_depth 48 | 49 | _#TODO(ying): this operation is a little bit awkward. Rethink about this._ 50 | 51 | ```python 52 | access_by_depth(x: FractalTensor[T], depth: int) -> FractalTensor[T] 53 | ``` 54 | 55 | Example `x = access_by_depth(x, x.depth)` : 56 | 57 |

58 |
59 | Fig. Access a depth-N FractalTensor variable x by the depth N. 60 |

61 | 62 | Example `x = access_by_depth(x, x.depth - 1)` : 63 | 64 |

65 |
66 | Fig. Access a depth-N FractalTensor variable x by the depth N - 1. 67 |

68 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_tensorflow.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | 8 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 9 | 10 | import math 11 | import unittest 12 | from time import time 13 | 14 | import tensorflow as tf 15 | from tf_model import StackedDRNN 16 | from utils import * 17 | 18 | 19 | class TFGraphDRNN(unittest.TestCase): 20 | 21 | def setUp(self): 22 | self.shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) 23 | self.stddev = 1.0 / math.sqrt(HIDDEN_SIZE) 24 | 25 | self.log_dir = '' 26 | self.logger = init_logger(self.log_dir, 'tensorflow_drnn.txt') 27 | 28 | def _apply_forward(self, test_name, model, *inputs): 29 | for i in range(WARMUP): 30 | output = model(*inputs) 31 | 32 | start = time() 33 | 34 | for i in range(ITERS): 35 | output = model(*inputs) 36 | report(test_name, start, self.logger) 37 | 38 | def test_drnn_forward(self): 39 | shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) 40 | stddev = 1.0 / math.sqrt(HIDDEN_SIZE) 41 | 42 | gpus = tf.config.list_physical_devices('GPU') 43 | for device in [ 44 | # 'cpu', 45 | '/device:GPU:0', 46 | ]: 47 | with tf.device(device): 48 | model = StackedDRNN(batch_size=BATCH_SIZE, 49 | seq_len=SEQ_LEN, 50 | input_size=INPUT_SIZE, 51 | hidden_size=HIDDEN_SIZE, 52 | dilation=DILATION) 53 | 54 | x = tf.random.uniform(shape, minval=-stddev, maxval=stddev) 55 | rate = DILATION[-1] 56 | padding_data = tf.zeros( 57 | ((rate - (SEQ_LEN % rate)) % rate, BATCH_SIZE, INPUT_SIZE), 58 | dtype=tf.dtypes.float32) 59 | test_name = f'TensorFlow_Stacked_DLSTM_{device}' 60 | 
self._apply_forward(test_name, model, x, padding_data) 61 | 62 | 63 | if __name__ == '__main__': 64 | unittest.main(argv=['first-arg-is-ignored']) 65 | -------------------------------------------------------------------------------- /kaleido/core/device/kernels/gather_scatter.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #pragma once 5 | 6 | namespace kaleido { 7 | namespace core { 8 | namespace cuda_kernel { 9 | 10 | #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ 11 | int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ 12 | for (index_type i = __index__; __index__ < (num); \ 13 | __index__ += blockDim.x * gridDim.x, i = __index__) 14 | 15 | template 16 | __global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims, 17 | const int64_t* indices, T* output, 18 | size_t remain_size, size_t slice_size, 19 | size_t end_size) { 20 | CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) { 21 | int64_t indices_i = i / slice_size; 22 | int64_t slice_i = i - indices_i * slice_size; // offset inside the slice 23 | int64_t gather_i = 0; 24 | int64_t temp = slice_size; 25 | for (int64_t j = end_size - 1; j >= 0; --j) { 26 | auto index_value = indices[indices_i * end_size + j]; 27 | gather_i += (index_value * temp); 28 | temp *= input_dims[j]; 29 | } 30 | int64_t input_i = gather_i + slice_i; 31 | *(output + i) = *(input + input_i); 32 | } 33 | } 34 | 35 | template 36 | __global__ void ScatterNdCUDAKernel(const T* update, const int64_t* indices, 37 | T* output, const int64_t* output_dims, 38 | size_t remain_size, size_t slice_size, 39 | size_t end_size) { 40 | CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) { 41 | int64_t indices_i = i / slice_size; 42 | int64_t slice_i = i - indices_i * slice_size; // offset inside the slice 43 | int64_t gather_i = 0; 44 | int64_t temp = slice_size; 
45 | for (int64_t j = end_size - 1; j >= 0; --j) { 46 | int64_t index_value = indices[indices_i * end_size + j]; 47 | 48 | gather_i += (index_value * temp); 49 | temp *= output_dims[j]; 50 | } 51 | int64_t output_i = gather_i + slice_i; 52 | atomicAdd(output + output_i, *(update + i)); 53 | } 54 | } 55 | 56 | } // namespace cuda_kernel 57 | } // namespace core 58 | } // namespace kaleido 59 | -------------------------------------------------------------------------------- /kaleido/core/device/cuda_info.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/device/cuda_info.h" 5 | 6 | #include "kaleido/core/device/cuda_utils.h" 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | namespace kaleido { 14 | namespace core { 15 | 16 | int GetGPUDeviceCount() { 17 | int deviceCount = 0; 18 | CudaCheck(cudaGetDeviceCount(&deviceCount)); 19 | return deviceCount; 20 | } 21 | 22 | int GetGPUComputeCapability(int id) { 23 | int major, minor; 24 | CudaCheck( 25 | cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id)); 26 | CudaCheck( 27 | cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id)); 28 | return major * 10 + minor; 29 | } 30 | 31 | int GetGPUMultiProcessors(int id) { 32 | int count; 33 | CudaCheck(cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); 34 | return count; 35 | } 36 | 37 | int GetGPUMaxThreadsPerMultiProcessor(int id) { 38 | int count; 39 | CudaCheck(cudaDeviceGetAttribute(&count, 40 | cudaDevAttrMaxThreadsPerMultiProcessor, id)); 41 | return count; 42 | } 43 | 44 | int GetGPUMaxThreadsPerBlock(int id) { 45 | int count; 46 | CudaCheck(cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); 47 | return count; 48 | } 49 | 50 | dim3 GetGpuMaxGridDimSize(int id) { 51 | dim3 grid_size; 52 | 53 | int size; 54 | 
CudaCheck(cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id)); 55 | grid_size.x = size; 56 | 57 | CudaCheck(cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id)); 58 | grid_size.y = size; 59 | 60 | CudaCheck(cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id)); 61 | grid_size.z = size; 62 | return grid_size; 63 | } 64 | 65 | std::string GetDeviceName() { 66 | cudaDeviceProp prop; 67 | cudaGetDeviceProperties(&prop, 0); 68 | 69 | std::stringstream ss(prop.name); 70 | const char delim = ' '; 71 | 72 | std::string s; 73 | std::vector out; 74 | 75 | while (std::getline(ss, s, delim)) { 76 | out.push_back(s); 77 | } 78 | 79 | std::stringstream out_ss; 80 | int i = 0; 81 | for (; i < out.size() - 1; ++i) out_ss << out[i] << "_"; 82 | out_ss << out[i]; 83 | return out_ss.str(); 84 | } 85 | 86 | } // namespace core 87 | } // namespace kaleido 88 | -------------------------------------------------------------------------------- /kaleido/parser/operations/access_patterns.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | from collections import OrderedDict 9 | from typing import Tuple 10 | 11 | from kaleido.frontend.types import FractalTensorStorage, Storage, TensorStorage 12 | from kaleido.parser.ir_nodes import AccessNode 13 | from kaleido.parser.operations.common import registers 14 | 15 | 16 | @registers.access.register 17 | class Index(AccessNode): 18 | opcode = 'index' 19 | arity = 1 20 | 21 | def __init__(self, name: str): 22 | super(Index, self).__init__(name, OrderedDict(), OrderedDict()) 23 | 24 | def propagate_storage(self) -> Storage: 25 | super(Index, self).propagate_storage() 26 | ids = self.attributes['index'] 27 | self.output_ports[list(self.output_ports.keys())[-1]] = list( 28 | self.input_ports.values())[0].element_type() 29 | 30 | 31 | @registers.access.register 32 | class Last(AccessNode): 33 | opcode = 'last' 34 | arity = 1 35 | 36 | def __init__(self, name: str): 37 | super(Last, self).__init__(name, OrderedDict(), OrderedDict()) 38 | 39 | 40 | @registers.access.register 41 | class Slice(AccessNode): 42 | opcode = 'slice' 43 | arity = 1 44 | 45 | def __init__(self, name: str): 46 | super(Slice, self).__init__(name, OrderedDict(), OrderedDict()) 47 | 48 | def propagate_storage(self) -> Storage: 49 | super(Slice, self).propagate_storage() 50 | 51 | lower = self.attributes['lower'] 52 | step = self.attributes['step'] 53 | upper = self.attributes['upper'] 54 | 55 | s_in = list(self.input_ports.values())[0].element_type() 56 | s_out = FractalTensorStorage(s_in) 57 | s_out.indices = list(range((upper - lower) // step)) 58 | self.output_ports[list(self.output_ports.keys())[-1]] = s_out 59 | 60 | 61 | @registers.access.register 62 | class Slices(AccessNode): 63 | opcode = 'slices' 64 | arity = 1 65 | 66 | def __init__(self, name: str): 67 | super(Slices, self).__init__(name, OrderedDict(), OrderedDict()) 68 | 
69 | def propagate_storage(self) -> Storage: 70 | super().propagate_storage() 71 | -------------------------------------------------------------------------------- /benchmarks/rnn/cuDNN/main.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "utils.h" 5 | 6 | int main(int argc, char* argv[]) { 7 | srand(1234); 8 | int batch_size = 64; 9 | int hidden_size = 256; 10 | int seq_length = 100; 11 | int depth = 10; 12 | 13 | int input_size = hidden_size; 14 | 15 | genSeqs(batch_size, seq_length, false); 16 | 17 | for (auto depth : {2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22}) { 18 | float cudnn_time = 19 | TestCuDNNLSTM(batch_size, hidden_size, seq_length, depth, input_size); 20 | 21 | std::stringstream ss; 22 | ss << "[" << batch_size << ", " << hidden_size << ", " << seq_length << ", " 23 | << depth << "]|"; 24 | std::cout << "|CuDNN|" << ss.str() << "||||" << cudnn_time << "|" 25 | << std::endl; 26 | } 27 | 28 | std::cout << std::endl; 29 | 30 | for (auto seq_length : {50, 75, 100, 125, 150, 175, 200}) { 31 | genSeqs(batch_size, seq_length, false); 32 | float cudnn_time = 33 | TestCuDNNLSTM(batch_size, hidden_size, seq_length, depth, input_size); 34 | 35 | std::stringstream ss; 36 | ss << "[" << batch_size << ", " << hidden_size << ", " << seq_length << ", " 37 | << depth << "]|"; 38 | std::cout << "|CuDNN|" << ss.str() << "||||" << cudnn_time << "|" 39 | << std::endl; 40 | } 41 | 42 | std::cout << std::endl; 43 | 44 | genSeqs(batch_size, seq_length, true); 45 | 46 | for (auto depth : {2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22}) { 47 | float cudnn_time = 48 | TestCuDNNLSTM(batch_size, hidden_size, seq_length, depth, input_size); 49 | 50 | std::stringstream ss; 51 | ss << "[" << batch_size << ", " << hidden_size << ", " << seq_length << ", " 52 | << depth << "]|"; 53 | std::cout << "|CuDNN|" << ss.str() << "||||" << 
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "kaleido/core/device/gpu_context.h"
#include "kaleido/core/device/kernels/gather_scatter.h"
#include "kaleido/core/operators/scatter_nd_op.h"

namespace kaleido {
namespace core {
namespace ops {

// ScatterNdAddOp: accumulate `updates` into `data` at positions addressed
// by `indices` (scatter_nd_add semantics: indices[..., :] is a tuple
// addressing one slice of `data`; the matching slice of `updates` is added
// into it).
//
// NOTE(review): template parameter lists were lost in extraction and are
// reconstructed as <typename T>; confirm against scatter_nd_op.h.
template <typename T>
class ScatterNdAddOp {
 public:
  void operator()(const GPUContext& context, Tensor& data,
                  const Tensor& updates, const Tensor& indices) {
    auto index_dims = indices.dims();
    auto index_dims_size = index_dims.size();

    auto output_dims = data.dims();
    auto output_dims_size = output_dims.size();

    // The last dimension of `indices` is the length of one index tuple.
    int64_t end_size = index_dims[index_dims_size - 1];

    // Number of index tuples: product of all but the last indices dim.
    // (The former `remain_dims` vector was removed: it was computed with
    // incorrect iterator arithmetic — `index_dims.end() - index_dims_size`
    // instead of `end() - 1` — and was never used.)
    int64_t remain_numel = 1;
    for (int i = 0; i < index_dims_size - 1; ++i) remain_numel *= index_dims[i];

    // Elements per addressed slice: product of the output dims not covered
    // by an index tuple.
    int64_t slice_size = 1;
    for (int64_t i = end_size; i < output_dims_size; ++i)
      slice_size *= output_dims[i];

    // The kernel needs the output shape on the device to flatten indices.
    int64_t* g_output_dims;
    CudaCheck(cudaMalloc(&g_output_dims, output_dims_size * sizeof(int64_t)));
    CudaCheck(cudaMemcpy(g_output_dims, output_dims.data(),
                         output_dims_size * sizeof(int64_t),
                         cudaMemcpyHostToDevice));

    int64_t block = 512;
    int64_t n = slice_size * remain_numel;
    int64_t grid = (n + block - 1) / block;

    cuda_kernel::ScatterNdCUDAKernel<T><<<grid, block>>>(
        updates.data<T>(), indices.data<int64_t>(), data.mutable_data<T>(),
        g_output_dims, remain_numel, slice_size, end_size);

    // Release the shape buffer: the previous version leaked this
    // allocation on every call. cudaFree synchronizes with outstanding
    // device work, so freeing right after the asynchronous launch is safe.
    CudaCheck(cudaFree(g_output_dims));
  }
};

template class ScatterNdAddOp<float>;
template class ScatterNdAddOp<int>;

}  // namespace ops
}  // namespace core
}  // namespace kaleido
# --------------------------------------------------------------------------

from typing import List

import tensorflow as tf

__all__ = [
    'StackedDRNN',
]


class StackedDRNN(tf.keras.Model):
    """Stacked dilated LSTM baseline built from cuDNN LSTM layers.

    One CuDNNLSTM per entry of `dilation`. Between layers, the sequence is
    re-batched so that a layer with dilation rate r sees r-strided
    sub-sequences as independent batch items, then the output is reverted
    to the original layout for the next layer.
    """

    def __init__(self, batch_size: int, seq_len: int, input_size: int,
                 hidden_size: int, dilation: List[int]):
        super(StackedDRNN, self).__init__()

        self.batch_size = batch_size
        self.seq_len = seq_len
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.dilation = dilation
        self.num_layers = len(dilation)

        # Pad the sequence up to a multiple of the largest dilation rate so
        # every layer's re-batching splits evenly.
        rate = dilation[-1]
        self.padded_length = (rate - (seq_len % rate)) % rate + self.seq_len

        self.cells = []
        for i in range(self.num_layers):
            self.cells.append(
                tf.compat.v1.keras.layers.CuDNNLSTM(hidden_size,
                                                    return_sequences=False))

    # uncomment the following line to enable auto-graph.
    # @tf.function
    def call(self, input, padding_data):
        # step 0: pad the input along axis 0 up to padded_length.
        # NOTE(review): this assumes a time-major layout where axis 0 is the
        # (padded) sequence dimension — confirm against the caller.
        input_x = tf.concat((input, padding_data), axis=0)

        # no special treatment for the first layer.
        xs = self.cells[0](input_x)

        for i, cell in enumerate(self.cells[1:]):
            # for layers above the first layer.
            # step 1: pre-process: form a new batch in which each r-strided
            # sub-sequence (r = dilation[i + 1]) is an independent item.
            num_split = self.padded_length // self.dilation[i + 1]

            xs_ = [
                tf.reshape(x, (-1, self.hidden_size))
                for x in tf.split(xs, num_or_size_splits=num_split, axis=0)
            ]
            dilated_input = tf.stack(xs_)

            # step 2: call LSTM layer
            xs = cell(dilated_input)

            # step 3: post-processing, revert to the original layout
            xss = [
                tf.split(x, self.dilation[i + 1], axis=0)
                for x in tf.unstack(xs, axis=0)
            ]

            # Flatten the nested splits back into a single stacked tensor.
            xs = tf.stack([x for sublist in xss for x in sublist])
        return xs
20 | extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; 21 | auto* sbuf = reinterpret_cast(shared_buf); 22 | __shared__ MD md; 23 | 24 | // cache input into shared memory 25 | int tid = threadIdx.x; 26 | int next_idx = blockIdx.x * width + tid; // element index in input array 27 | int cur_idx = tid; // element index in current row 28 | for (; cur_idx < width; next_idx += BLOCK_SIZE, cur_idx += BLOCK_SIZE) { 29 | sbuf[cur_idx] = input[next_idx]; 30 | } 31 | __syncthreads(); 32 | 33 | // Loop1: reduction Max, the maximum value is stored in md.m. 34 | Max max; 35 | md.m = 36 | BlockRowReduce, BLOCK_SIZE>(sbuf, width, max, -MaxValue()); 37 | __syncthreads(); 38 | 39 | // Loop2: reduction sum of exponential and substraction: 40 | // sum(exp(x - m)). the reduction sum is stored in md.d; 41 | SubAndExp sub_and_exp(md.m); // mapper 42 | Add sum; // reducer 43 | Inverse inverse; // finalizer 44 | md.d = BlockRowReduce, Add, Inverse, BLOCK_SIZE>( 45 | sbuf, sbuf, width, sub_and_exp /*mapper*/, sum /*reducer*/, 46 | inverse /*finalizer*/, static_cast(0) /*initialier of reduction*/); 47 | __syncthreads(); 48 | 49 | // Loop3: map to rescale. 50 | for (int cur_idx = tid; cur_idx < width; cur_idx += BLOCK_SIZE) { 51 | sbuf[cur_idx] *= md.d; 52 | } 53 | 54 | // Store result into global memory. 55 | tid = threadIdx.x; 56 | next_idx = blockIdx.x * width + tid; 57 | cur_idx = tid; 58 | for (; cur_idx < width; next_idx += BLOCK_SIZE, cur_idx += BLOCK_SIZE) { 59 | output[next_idx] = sbuf[cur_idx]; 60 | } 61 | } 62 | 63 | } // namespace cuda_kernel 64 | } // namespace core 65 | } // namespace kaleido 66 | -------------------------------------------------------------------------------- /cmake/third_party.cmake: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 
# MIT License.
# --------------------------------------------------------------------------

set(THIRD_PARTY_PATH
    "${CMAKE_BINARY_DIR}/third_party"
    CACHE STRING
          "A path setting third party libraries download & build directories.")

set(THIRD_PARTY_CACHE_PATH
    "${CMAKE_SOURCE_DIR}"
    CACHE STRING
          "A path cache third party source code to avoid repeated download.")

set(THIRD_PARTY_BUILD_TYPE Release)
set(EXTERNAL_PROJECT_LOG_ARGS
    LOG_DOWNLOAD
    0
    LOG_UPDATE
    1
    LOG_CONFIGURE
    1
    LOG_BUILD
    0
    LOG_TEST
    1
    LOG_INSTALL
    0)
set(SHALLOW_CLONE "GIT_SHALLOW TRUE")

# Derive <TARGET_NAME>_DOWNLOAD_CMD (a git-repo or URL download command for
# ExternalProject_Add) from the TARGET name and the REPOSITORY/TAG or URL
# arguments, and export it to the parent scope.
function(cache_third_party TARGET)
  set(options "")
  set(oneValueArgs URL REPOSITORY TAG DIR)
  set(multiValueArgs "")
  # FIX: was "${optionps}" (typo) — an undefined variable passed where the
  # options list belongs. Harmless today only because `options` is empty.
  cmake_parse_arguments(cache_third_party "${options}" "${oneValueArgs}"
                        "${multiValueArgs}" ${ARGN})

  # extern_gflags -> GFLAGS etc.: strip the prefix and digits, uppercase.
  string(REPLACE "extern_" "" TARGET_NAME ${TARGET})
  string(REGEX REPLACE "[0-9]+" "" TARGET_NAME ${TARGET_NAME})
  string(TOUPPER ${TARGET_NAME} TARGET_NAME)

  if(cache_third_party_REPOSITORY)
    set(${TARGET_NAME}_DOWNLOAD_CMD GIT_REPOSITORY
                                    ${cache_third_party_REPOSITORY})

    if(cache_third_party_TAG)
      list(APPEND ${TARGET_NAME}_DOWNLOAD_CMD GIT_TAG ${cache_third_party_TAG})
    endif()
  elseif(cache_third_party_URL)
    set(${TARGET_NAME}_DOWNLOAD_CMD URL ${cache_third_party_URL})
  else()
    message(
      FATAL_ERROR "Download link (Git repo or URL) must be specified for cache!"
    )
  endif()

  # Pass ${TARGET_NAME}_DOWNLOAD_CMD to parent scope, the double quotation marks
  # can't be removed
  set(${TARGET_NAME}_DOWNLOAD_CMD
      "${${TARGET_NAME}_DOWNLOAD_CMD}"
      PARENT_SCOPE)
endfunction()

set(third_party_deps)

include(external/gflags)
include(external/glog)
include(external/gtest)
include(external/pybind)
include(external/zlib)
include(external/protobuf)
# required by benchmarks include(external/tvm)
include(external/cccl)
include(external/cutlass)
list(APPEND third_party_deps extern_gtest extern_glog)
FORCE) 14 | set(GFLAGS_REPOSITORY https://github.com/gflags/gflags.git) 15 | set(GFLAGS_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a) 16 | set(GFLAGS_LIBRARIES 17 | "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" 18 | CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) 19 | set(BUILD_COMMAND $(MAKE) --silent) 20 | set(INSTALL_COMMAND $(MAKE) install) 21 | 22 | include_directories(${GFLAGS_INCLUDE_DIR}) 23 | 24 | cache_third_party( 25 | extern_gflags 26 | REPOSITORY 27 | ${GFLAGS_REPOSITORY} 28 | TAG 29 | ${GFLAGS_TAG} 30 | DIR 31 | GFLAGS_SOURCE_DIR) 32 | 33 | ExternalProject_Add( 34 | extern_gflags 35 | ${EXTERNAL_PROJECT_LOG_ARGS} 36 | ${SHALLOW_CLONE} 37 | "${GFLAGS_DOWNLOAD_CMD}" 38 | PREFIX ${GFLAGS_PREFIX_DIR} 39 | SOURCE_DIR ${GFLAGS_SOURCE_DIR} 40 | BUILD_COMMAND ${BUILD_COMMAND} 41 | INSTALL_COMMAND ${INSTALL_COMMAND} 42 | UPDATE_COMMAND "" 43 | CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} 44 | -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} 45 | -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} 46 | -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} 47 | -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} 48 | -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} 49 | -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} 50 | -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} 51 | -DBUILD_STATIC_LIBS=ON 52 | -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} 53 | -DCMAKE_POSITION_INDEPENDENT_CODE=ON 54 | -DBUILD_TESTING=OFF 55 | -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} 56 | ${EXTERNAL_OPTIONAL_ARGS} 57 | CMAKE_CACHE_ARGS 58 | -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} 59 | -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON 60 | -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}) 61 | 62 | add_library(gflags STATIC IMPORTED GLOBAL) 63 | set_property(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) 64 | add_dependencies(gflags extern_gflags) 65 | -------------------------------------------------------------------------------- /cmake/external/glog.cmake: 
-------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | include(ExternalProject) 7 | 8 | set(GLOG_PREFIX_DIR ${THIRD_PARTY_PATH}/glog) 9 | set(GLOG_SOURCE_DIR ${THIRD_PARTY_PATH}/glog/src/extern_glog) 10 | set(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) 11 | set(GLOG_INCLUDE_DIR 12 | "${GLOG_INSTALL_DIR}/include" 13 | CACHE PATH "glog include directory." FORCE) 14 | set(GLOG_REPOSITORY https://github.com/google/glog.git) 15 | set(GLOG_TAG v0.3.5) 16 | 17 | set(GLOG_LIBRARIES 18 | "${GLOG_INSTALL_DIR}/lib/libglog.a" 19 | CACHE FILEPATH "glog library." FORCE) 20 | set(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) 21 | 22 | include_directories(${GLOG_INCLUDE_DIR}) 23 | 24 | cache_third_party( 25 | extern_glog 26 | REPOSITORY 27 | ${GLOG_REPOSITORY} 28 | TAG 29 | ${GLOG_TAG} 30 | DIR 31 | GLOG_SOURCE_DIR) 32 | 33 | ExternalProject_Add( 34 | extern_glog 35 | ${EXTERNAL_PROJECT_LOG_ARGS} 36 | ${SHALLOW_CLONE} 37 | "${GLOG_DOWNLOAD_CMD}" 38 | DEPENDS gflags 39 | PREFIX ${GLOG_PREFIX_DIR} 40 | SOURCE_DIR ${GLOG_SOURCE_DIR} 41 | UPDATE_COMMAND "" 42 | CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} 43 | -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} 44 | -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} 45 | -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} 46 | -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} 47 | -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} 48 | -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} 49 | -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} 50 | -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} 51 | -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib 52 | -DCMAKE_POSITION_INDEPENDENT_CODE=ON 53 | -DWITH_GFLAGS=ON 54 | -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags 55 | 
-DBUILD_TESTING=OFF 56 | -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} 57 | ${EXTERNAL_OPTIONAL_ARGS} 58 | CMAKE_CACHE_ARGS 59 | -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} 60 | -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib 61 | -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON 62 | -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}) 63 | 64 | add_library(glog SHARED IMPORTED GLOBAL) 65 | set_property(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) 66 | add_dependencies(glog extern_glog gflags) 67 | link_libraries(glog gflags) 68 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | project(recurrence_test CXX C) 3 | 4 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 5 | "${CMAKE_SOURCE_DIR}/../../../../cmake/Modules/") 6 | 7 | set(CMAKE_BUILD_TYPE Release) 8 | 9 | set(CMAKE_CXX_STANDARD 17) 10 | set(CMAKE_CXX_STANDARD_REQUIRED TRUE) 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) 13 | 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") 15 | set(CMAKE_CXX_FLAGS_DEBUG 16 | "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") 17 | set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall -Wno-sign-compare") 18 | 19 | set(CMAKE_CXX_LINK_EXECUTABLE 20 | "${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") 21 | 22 | find_package(CUDA QUIET REQUIRED) 23 | find_package(CuDNN QUIET REQUIRED) 24 | 25 | cuda_select_nvcc_arch_flags(ARCH_FLAGS "Auto") 26 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${ARCH_FLAGS}") 27 | message(STATUS "CUDA Architecture flags = ${ARCH_FLAGS}") 28 | set(CUDA_PROPAGATE_HOST_FLAGS OFF) 29 | 30 | # FIXME(ying): The RNN examples do not rely on cutlass, but the fill kernel in 31 | # `fill.h` depends on cutlass, and cutlass require C++17. 
This is a hotfix to 32 | # bypass the compiling error. Make the dependency clean in the future. 33 | if(CUTLASS_NATIVE_CUDA) 34 | set(CMAKE_CUDA_STANDARD 17) 35 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 36 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 37 | else() 38 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++17) 39 | endif() 40 | include_directories( 41 | "../../../../build/third_party/cutlass/src/extern_cutlass/include") 42 | include_directories( 43 | "../../../../build/third_party/cutlass/src/extern_cutlass/tools/util/include") 44 | 45 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -w ${ARCH_FLAGS}) 46 | set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} -w ${ARCH_FLAGS}) 47 | set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 ${ARCH_FLAGS}) 48 | 49 | include_directories(${CUDA_INCLUDE_DIRS}) 50 | include_directories(${CUDNN_INCLUDE_DIRS}) 51 | 52 | include_directories("../") 53 | include_directories("../../../../") 54 | include_directories("../../../../build/third_party/install/glog/include") 55 | include_directories( 56 | "../../../../build/third_party/gflags/src/extern_gflags-build/include") 57 | link_directories("../../../../build/kaleido/core") 58 | link_directories("../../../../build/kaleido/core/operators") 59 | 60 | cuda_add_executable(lstm lstm.cu) 61 | target_link_libraries( 62 | lstm 63 | ${CUDA_LIBRARIES} 64 | ${CUDNN_LIBRARIES} 65 | ${CUDA_CUBLAS_LIBRARIES} 66 | ${CUDA_curand_LIBRARY} 67 | fractaltensor_core 68 | print_op) 69 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/grid_lstm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | project(recurrence_test CXX C) 3 | 4 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 5 | "${CMAKE_SOURCE_DIR}/../../../../cmake/Modules/") 6 | 7 | set(CMAKE_BUILD_TYPE Release) 8 | 9 | set(CMAKE_CXX_STANDARD 17) 10 | 
set(CMAKE_CXX_STANDARD_REQUIRED TRUE) 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) 13 | 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") 15 | set(CMAKE_CXX_FLAGS_DEBUG 16 | "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") 17 | set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall -Wno-sign-compare") 18 | 19 | set(CMAKE_CXX_LINK_EXECUTABLE 20 | "${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") 21 | 22 | find_package(CUDA QUIET REQUIRED) 23 | find_package(CuDNN QUIET REQUIRED) 24 | 25 | cuda_select_nvcc_arch_flags(ARCH_FLAGS "Auto") 26 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${ARCH_FLAGS}") 27 | message(STATUS "CUDA Architecture flags = ${ARCH_FLAGS}") 28 | set(CUDA_PROPAGATE_HOST_FLAGS OFF) 29 | 30 | # FIXME(ying): The RNN examples do not rely on cutlass, but the fill kernel in 31 | # `fill.h` depends on cutlass, and cutlass require C++17. This is a hotfix to 32 | # bypass the compiling error. Make the dependency clean in the future. 
33 | if(CUTLASS_NATIVE_CUDA) 34 | set(CMAKE_CUDA_STANDARD 17) 35 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 36 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 37 | else() 38 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++17) 39 | endif() 40 | include_directories( 41 | "../../../../build/third_party/cutlass/src/extern_cutlass/include") 42 | include_directories( 43 | "../../../../build/third_party/cutlass/src/extern_cutlass/tools/util/include") 44 | 45 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -w ${ARCH_FLAGS}) 46 | set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} -w ${ARCH_FLAGS}) 47 | set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 ${ARCH_FLAGS}) 48 | 49 | include_directories(${CUDA_INCLUDE_DIRS}) 50 | include_directories(${CUDNN_INCLUDE_DIRS}) 51 | 52 | include_directories("../") 53 | include_directories("../../../../") 54 | include_directories("../../../../build/third_party/install/glog/include") 55 | include_directories( 56 | "../../../../build/third_party/gflags/src/extern_gflags-build/include") 57 | link_directories("../../../../build/kaleido/core") 58 | link_directories("../../../../build/kaleido/core/operators") 59 | 60 | cuda_add_executable(grid_rnn stacked_grid_rnn.cu) 61 | target_link_libraries( 62 | grid_rnn 63 | ${CUDA_LIBRARIES} 64 | ${CUDNN_LIBRARIES} 65 | ${CUDA_CUBLAS_LIBRARIES} 66 | ${CUDA_curand_LIBRARY} 67 | fractaltensor_core 68 | concat_op 69 | print_op) 70 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/dilated_lstm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | project(recurrence_test CXX C) 3 | 4 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 5 | "${CMAKE_SOURCE_DIR}/../../../../cmake/Modules/") 6 | 7 | set(CMAKE_BUILD_TYPE Release) 8 | 9 | set(CMAKE_CXX_STANDARD 17) 10 | set(CMAKE_CXX_STANDARD_REQUIRED TRUE) 11 | set(CMAKE_CUDA_STANDARD 17) 12 | 
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) 13 | 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") 15 | set(CMAKE_CXX_FLAGS_DEBUG 16 | "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") 17 | set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall -Wno-sign-compare") 18 | 19 | set(CMAKE_CXX_LINK_EXECUTABLE 20 | "${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") 21 | 22 | find_package(CUDA QUIET REQUIRED) 23 | find_package(CuDNN QUIET REQUIRED) 24 | 25 | cuda_select_nvcc_arch_flags(ARCH_FLAGS "Auto") 26 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${ARCH_FLAGS}") 27 | message(STATUS "CUDA Architecture flags = ${ARCH_FLAGS}") 28 | 29 | # FIXME(ying): The RNN examples do not rely on cutlass, but the fill kernel in 30 | # `fill.h` depends on cutlass, and cutlass require C++17. This is a hotfix to 31 | # bypass the compiling error. Make the dependency clean in the future. 32 | if(CUTLASS_NATIVE_CUDA) 33 | set(CMAKE_CUDA_STANDARD 17) 34 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 35 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 36 | else() 37 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++17) 38 | endif() 39 | include_directories( 40 | "../../../../build/third_party/cutlass/src/extern_cutlass/include") 41 | include_directories( 42 | "../../../../build/third_party/cutlass/src/extern_cutlass/tools/util/include") 43 | 44 | set(CUDA_PROPAGATE_HOST_FLAGS OFF) 45 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -w ${ARCH_FLAGS}) 46 | set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} -w ${ARCH_FLAGS}) 47 | set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 ${ARCH_FLAGS}) 48 | 49 | include_directories(${CUDA_INCLUDE_DIRS}) 50 | include_directories(${CUDNN_INCLUDE_DIRS}) 51 | 52 | include_directories("../") 53 | include_directories("../../../../") 54 | include_directories("../../../../build/third_party/install/glog/include") 55 | include_directories( 56 | "../../../../build/third_party/gflags/src/extern_gflags-build/include") 57 | 
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

from typing import NamedTuple

import context

import kaleido
from examples.hello_world.utils import *
from kaleido import FractalTensor, Tensor
from kaleido import operations as ops
from kaleido.parser.plot import PlotProgram

ctx = kaleido.Context()


@kaleido.params(ctx)
class Params(NamedTuple):
    # Per-depth recurrent weight matrices; f1 scans over zip(Ws, Us).
    Ws: FractalTensor[Tensor['512, 512', float, 'cpu']]
    Us: FractalTensor[Tensor['512, 512', float, 'cpu']]


@kaleido.function(ctx)
def f3(a: Tensor['1, 512', float, 'cpu'], b: Tensor['1, 512', float, 'cpu'],
       c: Tensor['512, 512', float, 'cpu'],
       d: Tensor['512, 512', float, 'cpu']) -> Tensor['1, 512', float, 'cpu']:
    # Cell computation: combine input `a` and state `b` through the two
    # weight matrices.
    y = a @ c + b @ d
    return y


@kaleido.function(ctx)
def f2(
        xs: FractalTensor[Tensor['1, 512', float, 'cpu']],
        w: Tensor['512, 512', float, 'cpu'], u: Tensor['512, 512', float, 'cpu']
) -> FractalTensor[Tensor['1, 512', float, 'cpu']]:
    # One layer: scan f3 over the time steps, threading the hidden state,
    # starting from a zero state.
    ys = ops.scan(lambda s, x: f3(x, s, w, u),
                  xs,
                  initializer=ops.zeros(shape=(1, 512),
                                        device='cpu',
                                        dtype='float'))
    return ys


@kaleido.function(ctx)
def f1(
        xs: FractalTensor[Tensor['1, 512', float, 'cpu']],
        Ws: FractalTensor[Tensor['512, 512', float,
                                 'cpu']], Us: FractalTensor[Tensor['512, 512',
                                                                   float, 'cpu']]
) -> FractalTensor[FractalTensor[Tensor['1, 512', float, 'cpu']]]:
    # Stack layers: scan over (W, U) pairs, feeding each layer's output
    # sequence to the next; the input sequence seeds the scan.
    yss = ops.scan(lambda state, x: f2(state, *x),
                   ops.zip(Ws, Us),
                   initializer=xs)
    return yss


@kaleido.function(ctx)
def f(
    xss: FractalTensor[FractalTensor[Tensor['1, 512', float,
                                            'cpu']]], params: Params
) -> FractalTensor[FractalTensor[FractalTensor[Tensor['1, 512', float,
                                                      'cpu']]]]:
    # Batch dimension: map the stacked network over each input sequence.
    ysss = ops.map(lambda xs: f1(xs, params.Ws, params.Us), xss)
    return ysss


# Propagate storage over the parsed program and plot its IR graph.
block = ctx[-1].ir_block
block.propagate_storage()

p = PlotProgram()
p.plot(block)

if __name__ == '__main__':
    param = Params(Ws=Ws, Us=Us)

    ysss = f(xss, param)
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 
36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /examples/sparse_attention/bigbird.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import context 7 | from sparse_attention_utils import * 8 | 9 | import kaleido 10 | from kaleido import operations as ops 11 | 12 |  | # Normalize one query block's attention scores: softmax over the concatenated  | # [left-global | window | right-global] row (width 32 + 96 + 32 = 160), then  | # split back into 5 width-32 chunks along dim 1: [g1, w_a, w_b, w_c, g2]. 13 | def norm( 14 | g1: Tensor['32, 32', float, 15 | 'cuda'], w1: Tensor['32, 96', float, 16 | 'cuda'], g2: Tensor['32, 32', float, 'cuda'] 17 | ) -> FractalTensor[Tensor['32, 32', float, 'cuda']]: 18 | v = ops.softmax(ops.cat((g1, w1, g2), 1), 1) 19 | v = ops.split(v, 5, 1) 20 | return v 21 | 22 |  | # Sparse attention over one sequence of 32-row blocks: windowed attention  | # plus attention to the first (ks[0]) and last (ks[-1]) blocks as globals.  | # NOTE(review): qs[2:-2] skips two boundary blocks at each end — presumably  | # those are the global-token blocks; confirm against the paper/utils. 23 | def attn_func( 24 | qs: FractalTensor[Tensor['32, 512', float, 'cuda']], 25 | ks: FractalTensor[Tensor['32, 512', float, 26 | 'cuda']], vs: FractalTensor[Tensor['32, 512', 27 | float, 'cuda']] 28 | ) -> FractalTensor[Tensor['32, 512', float, 'cuda']]: 29 | # windowed attention and global attention 30 | # NOTE: Multiple heads and random attention are OMITTED for brevity. 
 | # Window keys/values: a window_size=3 shifted slide over (ks, vs). 31 | wks, wvs = ops.shifted_slide(ops.zip(ks, vs), window_size=3) 32 | wys = ops.map(lambda x: x[0] @ ops.flatten(x[1]).T, 33 | ops.zip(qs[2:-2], wks[2:-2])) 34 | gys1 = ops.map(lambda x: x @ ks[0].T, qs[2:-2]) # left global attention 35 | gys2 = ops.map(lambda x: x @ ks[-1].T, qs[2:-2]) # right global attention 36 |  | # Joint softmax across [left-global, window, right-global] scores per block. 37 | normed_vecs = ops.map(lambda x: norm(*x), ops.zip(gys1, wys, gys2)) 38 |  | # Weighted values: chunks 0 and -1 weight the global blocks; chunks 1..3  | # ([1:-1]) weight the 3-block window. NOTE(review): the rebindings of wvs  | # and vs below shadow the earlier names — intentional but easy to misread. 39 | gvs1 = ops.map(lambda x: x[0] @ vs[0], normed_vecs) 40 | gvs2 = ops.map(lambda x: x[-1] @ vs[-1], normed_vecs) 41 | 42 | wvs = ops.map(lambda x: ops.flatten(x[0][1:-1]).T @ ops.flatten(x[1]), 43 | ops.zip(normed_vecs, wvs[2:-2])) 44 | vs = ops.map(lambda x: x[0] + x[1] + x[2], ops.zip(gvs1, gvs2, wvs)) 45 | return vs 46 | 47 |  | # Batch entry point: apply attn_func to each (qs, ks, vs) triple. 48 | def bigbird( 49 | qss: FractalTensor[FractalTensor[Tensor['32, 512', float, 'cuda']]], 50 | kss: FractalTensor[FractalTensor[Tensor['32, 512', float, 'cuda']]], 51 | vss: FractalTensor[FractalTensor[Tensor['32, 512', float, 'cuda']]] 52 | ) -> FractalTensor[FractalTensor[Tensor['32, 512', float, 'cuda']]]: 53 | v = ops.map(lambda xs: attn_func(*xs), ops.zip(qss, kss, vss)) 54 | return v 55 | 56 |  | # 4096 tokens in blocks of 32 -> 128 blocks per sequence.  | # NOTE(review): Tensor, FractalTensor and create_blocked_input come from the  | # star-import of sparse_attention_utils — verify. 57 | if __name__ == '__main__': 58 | batch_size = 16 59 | seq_len = 4096 60 | hidden = 512 61 | block_size = 32 62 | 63 | qss = create_blocked_input(batch_size, hidden, block_size, seq_len) 64 | kss = create_blocked_input(batch_size, hidden, block_size, seq_len) 65 | vss = create_blocked_input(batch_size, hidden, block_size, seq_len) 66 | 67 | bigbird(qss, kss, vss) 68 | --------------------------------------------------------------------------------