├── benchmarks ├── rnn │ ├── fractaltensor │ │ ├── README.md │ │ ├── stacked_lstm │ │ │ ├── images │ │ │ │ ├── lstm.png │ │ │ │ ├── access_to_generate_ysss.png │ │ │ │ ├── access_to_generate_ysss.pptx │ │ │ │ └── preprocess.py │ │ │ ├── regions │ │ │ │ └── regions.h │ │ │ ├── README.md │ │ │ └── CMakeLists.txt │ │ ├── figures │ │ │ ├── grid_lstm_fractaltensor.png │ │ │ └── grid_lstm_fractaltensor.pptx │ │ ├── cute_stacked_lstm │ │ │ ├── figures │ │ │ │ ├── etdg-lstm.png │ │ │ │ ├── access_map1.png │ │ │ │ ├── access_map2.png │ │ │ │ ├── access_map3.png │ │ │ │ └── access_map4.png │ │ │ ├── Makefile │ │ │ └── README.md │ │ ├── dilated_lstm │ │ │ ├── regions │ │ │ │ └── regions.h │ │ │ └── CMakeLists.txt │ │ ├── grid_lstm │ │ │ ├── Makefile │ │ │ ├── regions │ │ │ │ └── regions.h │ │ │ ├── run.sh │ │ │ └── CMakeLists.txt │ │ └── cute_dilated_lstm │ │ │ └── Makefile │ ├── baselines │ │ ├── figures │ │ │ ├── figures.pptx │ │ │ └── dilaited_lstm_pytorch.png │ │ ├── stacked_lstm │ │ │ ├── figures │ │ │ │ ├── stacked_lstm_perf_with_depth.pdf │ │ │ │ ├── for_plot.tsv │ │ │ │ ├── stacked_lstm_results.tsv │ │ │ │ └── perf_with_increased_depth_subplot1.tsv │ │ │ ├── tf_model │ │ │ │ └── __init__.py │ │ │ ├── pt_model │ │ │ │ └── __init__.py │ │ │ ├── triton_model │ │ │ │ └── __init__.py │ │ │ ├── README.md │ │ │ └── test_utils.py │ │ ├── grid_lstm │ │ │ ├── README.md │ │ │ ├── pt_model │ │ │ │ └── __init__.py │ │ │ ├── triton_model │ │ │ │ └── __init__.py │ │ │ ├── tf_model │ │ │ │ └── __init__.py │ │ │ ├── run_grid_lstm_pt.sh │ │ │ └── test_utils.py │ │ ├── stacked_dilated_rnn │ │ │ ├── tf_model │ │ │ │ ├── __init__.py │ │ │ │ └── model.py │ │ │ ├── triton_model │ │ │ │ └── __init__.py │ │ │ ├── pt_model │ │ │ │ └── __init__.py │ │ │ ├── README.md │ │ │ └── stacked_drnn_tensorflow.py │ │ └── README.md │ ├── cuDNN │ │ ├── Makefile │ │ ├── lstm_cell_cudnn.cu │ │ ├── stacked_lstm_cudnn.cu │ │ ├── utils.h │ │ ├── CMakeLists.txt │ │ └── main.cu │ └── tvm │ │ ├── utils.py │ │ └── 
CMakeLists.txt └── fused_two_hgemms │ ├── baseline │ └── README.md │ ├── fractaltensor │ ├── figures │ │ ├── access_maps.png │ │ ├── fused_two_gemms.png │ │ ├── etdg_for_two_gemms.png │ │ └── gemm_translated_to_macro_kernel.png │ └── Makefile │ └── README.md ├── docs ├── images │ ├── zip.png │ ├── stack.png │ ├── data_types.png │ ├── frontend_tree.png │ ├── primitive_type.png │ ├── access_by_depth_1.png │ ├── access_by_depth_2.png │ ├── fractaltensor_layout.png │ ├── grid_rnn_example │ │ ├── cell.png │ │ ├── scan_x.png │ │ ├── scan_y.png │ │ ├── grid_cell.png │ │ ├── grid_rnn.png │ │ └── code_structure_and_memory.png │ ├── index_a_FractalTensor_1.png │ ├── index_a_FractalTensor_2.png │ ├── type_expression_tensor.png │ ├── product_two_FractalTensor.png │ ├── slide_over_fractaltensor.png │ └── type_expression_FractalTensor.png └── fractaltensor_operations │ ├── information_query.md │ ├── access_primitives.md │ ├── memory_layout_of_fractaltensor.md │ ├── memory_operations.md │ └── extended_access_operations.md ├── assets ├── FractalTensor-logo.png └── FractalTensor_overview.png ├── requirements.txt ├── examples ├── dilated_rnn │ ├── example.gv.pdf │ ├── context.py │ └── dilated_rnn.py ├── hello_world │ ├── example.gv.pdf │ ├── context.py │ ├── utils.py │ └── hello_world.py ├── stacked_rnn │ ├── example.gv.pdf │ └── context.py ├── sparse_attention │ ├── figures │ │ └── bigbird-attn.png │ ├── context.py │ ├── torch_windowed_attention_demo.py │ ├── README.md │ └── bigbird.py ├── __init__.py ├── utils │ └── __init__.py ├── convolution │ ├── context.py │ └── utils.py ├── grid_rnn │ ├── context.py │ └── grid_rnn_utils.py ├── transformer │ └── context.py ├── flash_attention │ ├── context.py │ ├── flash_attention_utils.py │ └── README.md ├── rnn_attention │ ├── context.py │ └── rnn_attention_utils.py └── README.md ├── kaleido ├── parser │ ├── tests │ │ ├── figures │ │ │ ├── udf1.gv.pdf │ │ │ ├── udf2.gv.pdf │ │ │ ├── udf3.gv.pdf │ │ │ ├── assignment1.gv.pdf │ │ │ ├── 
assignment2.gv.pdf │ │ │ ├── assignment3.gv.pdf │ │ │ ├── assignment4.gv.pdf │ │ │ ├── assignment5.gv.pdf │ │ │ └── assignment6.gv.pdf │ │ ├── context.py │ │ └── utils.py │ ├── errors.py │ └── operations │ │ └── access_patterns.py ├── core │ ├── device │ │ ├── device_context.h │ │ ├── tests │ │ │ └── CMakeLists.txt │ │ ├── cuda_info.h │ │ ├── kernels │ │ │ ├── softmax_common.h │ │ │ ├── tile_transmitter.h │ │ │ ├── lstm │ │ │ │ ├── dilated_lstm │ │ │ │ │ └── region1.h │ │ │ │ └── stacked_lstm │ │ │ │ │ └── region1.h │ │ │ ├── gather_scatter.h │ │ │ └── softmax_v2.h │ │ ├── traits_base.h │ │ ├── cuda_timer.h │ │ ├── gpu_context.cc │ │ ├── gpu_context.h │ │ └── cuda_info.cc │ ├── tests │ │ ├── test_main.cc │ │ ├── CMakeLists.txt │ │ ├── test_cuda_info.cc │ │ ├── test_layout.cc │ │ └── test_allocator.cc │ ├── fractal_tensor.cc │ ├── operators │ │ ├── expect_eq_op.h │ │ ├── softmax_op.h │ │ ├── transpose_op.h │ │ ├── print_op.h │ │ ├── online_softmax_op.h │ │ ├── gather_nd_op.h │ │ ├── scatter_nd_op.h │ │ ├── concat_op.h │ │ ├── matmul_op.h │ │ ├── fill_op.h │ │ ├── gemm_batched_op.h │ │ ├── CMakeLists.txt │ │ ├── elementwise_op.h │ │ ├── softmax_op.cu │ │ ├── online_softmax_op.cu │ │ ├── launch_config.h │ │ ├── tests │ │ │ └── b2b_gemm_test_utils.h │ │ ├── fill_op.cu │ │ ├── gather_nd_op.cu │ │ └── scatter_nd_op.cu │ ├── allocator.h │ ├── tensor.cc │ ├── data_types.proto │ ├── init.cc │ ├── tensor_shape.cc │ ├── place.cc │ ├── config.h │ ├── tile_shape.h │ ├── fractal_tensor.h │ ├── layout.h │ ├── place.h │ ├── tensor_shape.h │ └── CMakeLists.txt ├── frontend │ ├── __init__.py │ ├── operations │ │ ├── tests │ │ │ ├── context.py │ │ │ ├── test_product.py │ │ │ ├── test_zip.py │ │ │ ├── test_constants.py │ │ │ ├── test_flatten.py │ │ │ ├── test_aggregate.py │ │ │ └── test_join.py │ │ ├── __init__.py │ │ └── tensor │ │ │ ├── arithmetic │ │ │ ├── contraction.py │ │ │ └── broadcast.py │ │ │ ├── data_movements.py │ │ │ └── reshape.py │ └── tests │ │ ├── context.py │ │ └── 
test_type_equivalence.py └── __init__.py ├── CODE_OF_CONDUCT.md ├── .gitignore ├── .clang-format ├── scripts ├── clang_format.hook └── format.sh ├── Makefile ├── cmake ├── python.cmake ├── external │ ├── pybind.cmake │ ├── tvm.cmake │ ├── cccl.cmake │ ├── cutlass.cmake │ ├── zlib.cmake │ ├── gflags.cmake │ └── glog.cmake └── third_party.cmake ├── LICENSE ├── SUPPORT.md ├── .pre-commit-config.yaml ├── CMakeLists.txt └── SECURITY.md /benchmarks/rnn/fractaltensor/README.md: -------------------------------------------------------------------------------- 1 | [TBD] 2 | -------------------------------------------------------------------------------- /benchmarks/fused_two_hgemms/baseline/README.md: -------------------------------------------------------------------------------- 1 | [TBD] 2 | -------------------------------------------------------------------------------- /docs/images/zip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/zip.png -------------------------------------------------------------------------------- /docs/images/stack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/stack.png -------------------------------------------------------------------------------- /docs/images/data_types.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/data_types.png -------------------------------------------------------------------------------- /assets/FractalTensor-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/assets/FractalTensor-logo.png -------------------------------------------------------------------------------- 
/docs/images/frontend_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/frontend_tree.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anytree==2.8.0 2 | astpretty==2.1.0 3 | asttokens==2.0.5 4 | absl-py==1.0.0 5 | graphviz==0.17 6 | -------------------------------------------------------------------------------- /docs/images/primitive_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/primitive_type.png -------------------------------------------------------------------------------- /assets/FractalTensor_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/assets/FractalTensor_overview.png -------------------------------------------------------------------------------- /docs/images/access_by_depth_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/access_by_depth_1.png -------------------------------------------------------------------------------- /docs/images/access_by_depth_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/access_by_depth_2.png -------------------------------------------------------------------------------- /docs/images/fractaltensor_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/fractaltensor_layout.png 
-------------------------------------------------------------------------------- /docs/images/grid_rnn_example/cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/grid_rnn_example/cell.png -------------------------------------------------------------------------------- /examples/dilated_rnn/example.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/examples/dilated_rnn/example.gv.pdf -------------------------------------------------------------------------------- /examples/hello_world/example.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/examples/hello_world/example.gv.pdf -------------------------------------------------------------------------------- /examples/stacked_rnn/example.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/examples/stacked_rnn/example.gv.pdf -------------------------------------------------------------------------------- /docs/images/grid_rnn_example/scan_x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/grid_rnn_example/scan_x.png -------------------------------------------------------------------------------- /docs/images/grid_rnn_example/scan_y.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/grid_rnn_example/scan_y.png -------------------------------------------------------------------------------- /docs/images/index_a_FractalTensor_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/index_a_FractalTensor_1.png -------------------------------------------------------------------------------- /docs/images/index_a_FractalTensor_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/index_a_FractalTensor_2.png -------------------------------------------------------------------------------- /docs/images/type_expression_tensor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/type_expression_tensor.png -------------------------------------------------------------------------------- /docs/images/grid_rnn_example/grid_cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/grid_rnn_example/grid_cell.png -------------------------------------------------------------------------------- /docs/images/grid_rnn_example/grid_rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/grid_rnn_example/grid_rnn.png -------------------------------------------------------------------------------- /docs/images/product_two_FractalTensor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/product_two_FractalTensor.png -------------------------------------------------------------------------------- /docs/images/slide_over_fractaltensor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/slide_over_fractaltensor.png -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/udf1.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/udf1.gv.pdf -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/udf2.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/udf2.gv.pdf -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/udf3.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/udf3.gv.pdf -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/figures/figures.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/baselines/figures/figures.pptx -------------------------------------------------------------------------------- /docs/images/type_expression_FractalTensor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/type_expression_FractalTensor.png -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/assignment1.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/assignment1.gv.pdf 
-------------------------------------------------------------------------------- /kaleido/parser/tests/figures/assignment2.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/assignment2.gv.pdf -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/assignment3.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/assignment3.gv.pdf -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/assignment4.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/assignment4.gv.pdf -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/assignment5.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/assignment5.gv.pdf -------------------------------------------------------------------------------- /kaleido/parser/tests/figures/assignment6.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/kaleido/parser/tests/figures/assignment6.gv.pdf -------------------------------------------------------------------------------- /examples/sparse_attention/figures/bigbird-attn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/examples/sparse_attention/figures/bigbird-attn.png 
-------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/images/lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/stacked_lstm/images/lstm.png -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/figures/dilaited_lstm_pytorch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/baselines/figures/dilaited_lstm_pytorch.png -------------------------------------------------------------------------------- /docs/images/grid_rnn_example/code_structure_and_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/docs/images/grid_rnn_example/code_structure_and_memory.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/figures/grid_lstm_fractaltensor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/figures/grid_lstm_fractaltensor.png -------------------------------------------------------------------------------- /benchmarks/fused_two_hgemms/fractaltensor/figures/access_maps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/fused_two_hgemms/fractaltensor/figures/access_maps.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/figures/grid_lstm_fractaltensor.pptx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/figures/grid_lstm_fractaltensor.pptx -------------------------------------------------------------------------------- /benchmarks/fused_two_hgemms/fractaltensor/figures/fused_two_gemms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/fused_two_hgemms/fractaltensor/figures/fused_two_gemms.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/etdg-lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/etdg-lstm.png -------------------------------------------------------------------------------- /benchmarks/fused_two_hgemms/fractaltensor/figures/etdg_for_two_gemms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/fused_two_hgemms/fractaltensor/figures/etdg_for_two_gemms.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map1.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map2.png 
-------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map3.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/cute_stacked_lstm/figures/access_map4.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/images/access_to_generate_ysss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/stacked_lstm/images/access_to_generate_ysss.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/images/access_to_generate_ysss.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/fractaltensor/stacked_lstm/images/access_to_generate_ysss.pptx -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/figures/stacked_lstm_perf_with_depth.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/rnn/baselines/stacked_lstm/figures/stacked_lstm_perf_with_depth.pdf -------------------------------------------------------------------------------- 
/benchmarks/fused_two_hgemms/fractaltensor/figures/gemm_translated_to_macro_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/FractalTensor/HEAD/benchmarks/fused_two_hgemms/fractaltensor/figures/gemm_translated_to_macro_kernel.png -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/dilated_lstm/regions/regions.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #pragma once 5 | 6 | #include "region1.h" 7 | #include "region2.h" 8 | -------------------------------------------------------------------------------- /kaleido/core/device/device_context.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace kaleido { 4 | namespace core { 5 | 6 | class DeviceContext { 7 | public: 8 | virtual ~DeviceContext() {} 9 | }; 10 | } // namespace core 11 | } // namespace kaleido 12 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/regions/regions.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #pragma once 5 | 6 | #include "region1.h" 7 | #include "region2.h" 8 | #include "region3.h" 9 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/grid_lstm/Makefile: -------------------------------------------------------------------------------- 1 | BENCH_NAME ?= grid_rnn 2 | BUILD_DIR := build 3 | 4 | .PHONY: build clean 5 | 6 | build: 7 | @mkdir -p build && cd build && cmake .. && make -j12 8 | 9 | $(BUILD_DIR)/$(BENCH_NAME): build 10 | 11 | clean: 12 | @rm -rf build 13 | -------------------------------------------------------------------------------- /benchmarks/fused_two_hgemms/fractaltensor/Makefile: -------------------------------------------------------------------------------- 1 | BENCH_NAME ?= back2back_hgemm 2 | BUILD_DIR := build 3 | 4 | .PHONY: build clean 5 | 6 | build: 7 | @mkdir -p build && cd build && cmake .. && make -j12 8 | 9 | $(BUILD_DIR)/$(BENCH_NAME): build 10 | 11 | clean: 12 | @rm -rf build 13 | -------------------------------------------------------------------------------- /examples/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from .data_utils import * 7 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/figures/for_plot.tsv: -------------------------------------------------------------------------------- 1 | Test Name Average Time Elapsed Time Throughput 2 | CuDNN 0.0251 0.7545 2544.7697 3 | PT 0.2123 6.3702 301.4025 4 | PT_JITed 0.0505 1.5161 1266.4127 5 | TF_GraphMode 0.0743 2.2293 861.2472 6 | TF_WhileOpLSTM 0.1068 3.2042 599.2203 7 | TF_AutoGraph 0.0501 1.5038 1276.7482 8 | -------------------------------------------------------------------------------- /kaleido/core/tests/test_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include 5 | #include 6 | 7 | int main(int argc, char** argv) { 8 | testing::InitGoogleTest(&argc, argv); 9 | google::InitGoogleLogging(argv[0]); 10 | 11 | return RUN_ALL_TESTS(); 12 | } 13 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/grid_lstm/regions/regions.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #pragma once 5 | 6 | #include "region1.h" 7 | #include "region2.h" 8 | #include "region3.h" 9 | #include "region4.h" 10 | #include "region5.h" 11 | #include "region6.h" 12 | #include "region7.h" 13 | #include "region8.h" 14 | -------------------------------------------------------------------------------- /benchmarks/rnn/cuDNN/Makefile: -------------------------------------------------------------------------------- 1 | BENCH_NAME ?= lstm_cell_cudnn 2 | BUILD_DIR := build 3 | OUTPUT_FILE ?= ../c_cudnn_lstm_cell_bench.tsv 4 | 5 | .PHONY: build bench clean 6 | 7 | build: 8 | @mkdir -p build && cd build && cmake .. && make -j 9 | 10 | $(BUILD_DIR)/$(BENCH_NAME): build 11 | 12 | bench: $(BUILD_DIR)/$(BENCH_NAME) 13 | @./$(BUILD_DIR)/$(BENCH_NAME) $(OUTPUT_FILE) 14 | 15 | clean: 16 | @rm -rf build 17 | -------------------------------------------------------------------------------- /examples/convolution/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/dilated_rnn/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/grid_rnn/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/hello_world/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/stacked_rnn/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/transformer/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /kaleido/core/fractal_tensor.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/fractal_tensor.h" 5 | 6 | #include 7 | 8 | namespace kaleido { 9 | namespace core { 10 | 11 | std::string FractalTensor::DebugString() const { 12 | return type_desc_.DebugString(); 13 | } 14 | 15 | } // namespace core 16 | } // namespace kaleido 17 | -------------------------------------------------------------------------------- /examples/flash_attention/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/rnn_attention/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /examples/sparse_attention/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | -------------------------------------------------------------------------------- /kaleido/core/device/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | set(TEST_ROOT ${PROJECT_SOURCE_DIR}/kaleido/core/device/tests) 7 | 8 | nv_test(test_tile_copy SRCS ${TEST_ROOT}/test_tile_copy.cu) 9 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_dilated_lstm/Makefile: -------------------------------------------------------------------------------- 1 | BENCH_NAME ?= dilated_lstm 2 | BUILD_DIR := build 3 | OUTPUT_FILE ?= ../../dilated_lstm_bench.tsv 4 | 5 | .PHONY: build bench clean 6 | 7 | build: 8 | @mkdir -p build && cd build && cmake .. && make -j 9 | 10 | $(BUILD_DIR)/$(BENCH_NAME): build 11 | 12 | bench: $(BUILD_DIR)/$(BENCH_NAME) 13 | @./$(BUILD_DIR)/$(BENCH_NAME) $(OUTPUT_FILE) 14 | 15 | clean: 16 | @rm -rf build 17 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/Makefile: -------------------------------------------------------------------------------- 1 | BENCH_NAME ?= stacked_lstm 2 | BUILD_DIR := build 3 | OUTPUT_FILE ?= ../../cute_stacked_lstm_bench.tsv 4 | 5 | .PHONY: build bench clean 6 | 7 | build: 8 | @mkdir -p build && cd build && cmake .. && make -j 9 | 10 | $(BUILD_DIR)/$(BENCH_NAME): build 11 | 12 | bench: $(BUILD_DIR)/$(BENCH_NAME) 13 | @./$(BUILD_DIR)/$(BENCH_NAME) $(OUTPUT_FILE) 14 | 15 | clean: 16 | @rm -rf build 17 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Restrictions of using Python's syntax in implementing the examples 2 | 3 | - The parser parses dataflow relations among variable generations and usages from 4 | the source code. Functions can be nested, but the current implementation of the parser ONLY 5 | resolves name alias in the function's local scope. Name resolver does not search 6 | the enclosing scope. 
A better implementation of name resolver will remove this limitation. 7 | -------------------------------------------------------------------------------- /kaleido/core/operators/expect_eq_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class ExpectEqOp { 11 | public: 12 | void operator()(const Tensor& x, const Tensor& y, float epsilon = 1e-5); 13 | }; 14 | 15 | } // namespace ops 16 | } // namespace core 17 | } // namespace kaleido 18 | -------------------------------------------------------------------------------- /kaleido/core/allocator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace kaleido { 6 | namespace core { 7 | 8 | // TODO(ying): make Place a template parameter. 9 | // template 10 | class Allocator { 11 | public: 12 | virtual ~Allocator() = default; 13 | 14 | virtual void* Allocate(const size_t& nbytes) = 0; 15 | virtual void Deallocate(void* ptr) = 0; 16 | }; 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/grid_lstm/README.md: -------------------------------------------------------------------------------- 1 | # gridLSTM 2 | 3 | ## Hyper-parameters 4 | 5 | 1. `batch_size`=20 6 | 2. `seq_len`=64 7 | 3. `hidden_size`=`input_size`=128 8 | 4. `rnn_cell`=`LSTM` 9 | 5. 
`iters` = 20, `warmup` = 10 10 | 11 | ## Result 12 | 13 | |Name|PyTorch Average Time| TF_graph Average Time| 14 | |:--|:--|:--| 15 | |gridlstm_gpu:0_forward| 2.6266 |2.5567| 16 | |gridlstm_cpu_forward| 8.6012 |3.7226| 17 | 18 | > tf_graph: using tf.compat.v1.session 19 | -------------------------------------------------------------------------------- /kaleido/core/operators/softmax_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class SoftmaxOp { 11 | public: 12 | void operator()(const DeviceContext& context, const Tensor& x, Tensor& y, 13 | int dim); 14 | }; 15 | 16 | } // namespace ops 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/core/operators/transpose_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class TransposeOp { 11 | public: 12 | void operator()(const Tensor& input, Tensor& output, 13 | std::vector dims); 14 | }; 15 | 16 | } // namespace ops 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/parser/tests/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..'))) 11 | 12 | import random 13 | import unittest 14 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/tf_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from . 
import rnn, rnn2 13 | -------------------------------------------------------------------------------- /kaleido/core/operators/print_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class PrintOp { 11 | public: 12 | std::string operator()(const Tensor& input, int precision = 3, int count = -1, 13 | int pos = -1) const; 14 | }; 15 | 16 | } // namespace ops 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/core/operators/online_softmax_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class OnlineNormalizedSoftmaxOp { 11 | public: 12 | void operator()(const DeviceContext& context, const Tensor& x, Tensor& y, 13 | int dim); 14 | }; 15 | 16 | } // namespace ops 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/core/operators/gather_nd_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class GatherNdOp { 11 | public: 12 | void operator()(const DeviceContext& context, Tensor& output, 13 | const Tensor& input, const Tensor& indices); 14 | }; 15 | 16 | } // namespace ops 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/core/tensor.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 
Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/tensor.h" 5 | 6 | #include 7 | 8 | namespace kaleido { 9 | namespace core { 10 | 11 | std::string Tensor::DebugString() const { 12 | std::stringstream ss; 13 | ss << "Tensor {" << std::endl << type_desc_.DebugString() << std::endl << "}"; 14 | return ss.str(); 15 | } 16 | 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/frontend/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | from kaleido.frontend.fractal_tensor import * 9 | from kaleido.frontend.tensor import * 10 | from kaleido.frontend.types import * 11 | -------------------------------------------------------------------------------- /kaleido/core/operators/scatter_nd_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class ScatterNdAddOp { 11 | public: 12 | void operator()(const DeviceContext& context, Tensor& data, 13 | const Tensor& updates, const Tensor& indices); 14 | }; 15 | 16 | } // namespace ops 17 | } // namespace core 18 | } // namespace kaleido 19 | -------------------------------------------------------------------------------- /kaleido/core/device/cuda_info.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | namespace kaleido { 6 | namespace 
core { 7 | 8 | int GetGPUDeviceCount(); 9 | 10 | int GetGPUComputeCapability(int id); 11 | 12 | int GetGPUMultiProcessors(int id); 13 | 14 | int GetGPUMaxThreadsPerMultiProcessor(int id); 15 | 16 | int GetGPUMaxThreadsPerBlock(int id); 17 | 18 | dim3 GetGpuMaxGridDimSize(int); 19 | 20 | std::string GetDeviceName(); 21 | 22 | } // namespace core 23 | } // namespace kaleido 24 | -------------------------------------------------------------------------------- /kaleido/core/operators/concat_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class ConcatOp { 11 | public: 12 | void operator()(const DeviceContext& context, 13 | const std::vector& inputs, Tensor& output, 14 | size_t dim); 15 | }; 16 | 17 | } // namespace ops 18 | } // namespace core 19 | } // namespace kaleido 20 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/context.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../..'))) 11 | 12 | import random 13 | import unittest 14 | 15 | import kaleido 16 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/pt_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from .rnn import small_model 13 | 14 | __all__ = [ 15 | "small_model", 16 | ] 17 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/triton_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | from .rnn import StackedLSTM 10 | 11 | sys.path.insert( 12 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 13 | 14 | __all__ = [ 15 | "StackedLSTM", 16 | ] 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | *.log 3 | !requirements*.txt 4 | *.tgz 5 | *.gz 6 | *.pyc 7 | .ipynb_checkpoints/ 8 | __pycache__/ 9 | .vs/ 10 | .vscode/ 11 | .data/ 12 | venv/ 13 | .idea/ 14 | .checkpoints/ 15 | *.pb.h 16 | *.pb.cc 17 | *_pb2.py 18 | tensorboard/ 19 | benchmarks/attention/baseline/MultiHeadAttention/log 20 | 21 | # generated by compiling tex files. 22 | *.aux 23 | *.bbl 24 | *.blg 25 | *.idx 26 | *.ind 27 | *.lof 28 | *.lot 29 | *.out 30 | *.toc 31 | *.acn 32 | *.acr 33 | *.alg 34 | *.glg 35 | *.glo 36 | *.gls 37 | *.ist 38 | *.fls 39 | *.gv 40 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/grid_lstm/pt_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from .model import StackedGridModel 13 | 14 | __all__ = [ 15 | "StackedGridModel", 16 | ] 17 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_dilated_rnn/tf_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from .model import StackedDRNN 13 | 14 | __all__ = [ 15 | 'StackedDRNN', 16 | ] 17 | -------------------------------------------------------------------------------- /kaleido/core/operators/matmul_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class MatMulOp { 11 | public: 12 | void operator()(const DeviceContext& context, const Tensor& A, bool trans_a, 13 | const Tensor& B, bool trans_b, Tensor& C, T alf = 1., 14 | T bet = 0.); 15 | }; 16 | 17 | } // namespace ops 18 | } // namespace core 19 | } // namespace kaleido 20 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/grid_lstm/triton_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) 
Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from .model import StackedGridModel 13 | 14 | __all__ = [ 15 | "StackedGridModel", 16 | ] 17 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_dilated_rnn/triton_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | from .rnn import StackedDRNN 10 | 11 | sys.path.insert( 12 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 13 | 14 | __all__ = [ 15 | "StackedDRNN", 16 | ] 17 | -------------------------------------------------------------------------------- /kaleido/core/operators/fill_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | template 10 | class FillOp { 11 | public: 12 | void operator()(Tensor& input); 13 | void operator()(Tensor& input, float value); 14 | void operator()(Tensor& input, float mean, float stddev); 15 | void operator()(Tensor& input, const std::string& mode, float scale = 1.); 16 | }; 17 | 18 | } // namespace ops 19 | } // namespace core 20 | } // namespace kaleido 21 | -------------------------------------------------------------------------------- /kaleido/frontend/tests/context.py: 
-------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..'))) 11 | 12 | import random 13 | import unittest 14 | 15 | import kaleido 16 | from kaleido import FractalTensor, FractalTensorStorage, Tensor, TensorStorage 17 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_dilated_rnn/pt_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from .model import StackedDRNN, StackedDRNNJIT 13 | 14 | __all__ = [ 15 | 'StackedDRNNJIT', 16 | 'StackedDRNN', 17 | ] 18 | -------------------------------------------------------------------------------- /kaleido/core/data_types.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package kaleido.core; 4 | 5 | message VarType { 6 | enum DataType { 7 | BOOL = 0; 8 | INT32 = 1; 9 | INT64 = 2; 10 | FP32 = 3; 11 | FP64 = 4; 12 | } 13 | 14 | DataType type = 1; 15 | 16 | message TensorTypeDesc { 17 | DataType dtype = 1; 18 | repeated int64 dims = 2; 19 | string place = 3; 20 | } 21 | 22 | message FractalTensorTypeDesc { 23 | TensorTypeDesc dtype = 1; 24 | int64 depth = 2; 25 | repeated bool is_static = 3; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /kaleido/core/device/kernels/softmax_common.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #pragma once 5 | 6 | namespace kaleido { 7 | namespace core { 8 | namespace cuda_kernel { 9 | 10 | template 11 | struct MD { 12 | T m; 13 | T d; 14 | }; 15 | 16 | template <> 17 | struct __align__(8) MD { 18 | float m; 19 | float d; 20 | }; 21 | 22 | template <> 23 | struct __align__(16) MD { 24 | double m; 25 | double d; 26 | }; 27 | 28 | } // namespace cuda_kernel 29 | } // namespace core 30 | } // namespace kaleido 31 | -------------------------------------------------------------------------------- /kaleido/core/operators/gemm_batched_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | #include 6 | 7 | namespace kaleido { 8 | namespace core { 9 | namespace ops { 10 | 11 | template 12 | class GemmBatchedOp { 13 | public: 14 | void operator()(const DeviceContext& context, const std::vector& A, 15 | bool trans_a, const std::vector& B, bool trans_b, 16 | std::vector& C, T alf = 1., T bet = 0.); 17 | }; 18 | 19 | } // namespace ops 20 | } // namespace core 21 | } // namespace kaleido 22 | -------------------------------------------------------------------------------- /kaleido/core/init.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace py = pybind11; 10 | 11 | namespace kaleido { 12 | namespace core { 13 | 14 | std::once_flag glog_init_flag; 15 | 16 | void InitGLOG(const std::string& prog_name) { 17 | std::call_once(glog_init_flag, [&]() { 18 | google::InitGoogleLogging(strdup(prog_name.c_str())); 19 | }); 20 | } 21 | 22 | PYBIND11_MODULE(_core, m) { m.def("init_glog", InitGLOG); } 23 | 24 | } // namespace core 25 | } // namespace kaleido 26 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | # Run manually to reformat a file: 2 | # clang-format -i --style=file 3 | BasedOnStyle: Google 4 | ColumnLimit: 80 5 | IndentWidth: 2 6 | AccessModifierOffset: -2 7 | DerivePointerAlignment: false 8 | KeepEmptyLinesAtTheStartOfBlocks: false 9 | SortIncludes: true 10 | IncludeBlocks: Regroup 11 | IncludeCategories: 12 | - Regex: '<([A-Za-z0-9\Q/-_\E])+>' 13 | Priority: 4 14 | - Regex: '<(catch2|boost)\/' 15 | Priority: 3 16 | - Regex: '<([A-Za-z0-9.\Q/-_\E])+>' 17 | Priority: 2 18 | - Regex: '"([A-Za-z0-9.\Q/-_\E])+"' 19 | Priority: 1 20 | 21 | AllowShortLoopsOnASingleLine: true 22 | AllowShortIfStatementsOnASingleLine: true 23 | Cpp11BracedListStyle: true 24 | -------------------------------------------------------------------------------- /kaleido/core/operators/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | op_library(fill_op DEPS ${CUDA_curand_LIBRARY}) 7 | op_library(concat_op) 8 | op_library(transpose_op) 9 | op_library(print_op) 10 | op_library(matmul_op) 11 | op_library(softmax_op) 12 | op_library(gemm_batched_op) 13 | op_library(elementwise_op) 14 | op_library(gather_nd_op) 15 | op_library(scatter_nd_op) 16 | op_library(online_softmax_op) 17 | op_library(expect_eq_op) 18 | add_subdirectory(tests) 19 | -------------------------------------------------------------------------------- /scripts/clang_format.hook: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #------------------------------------------------------------------------- 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT License. 5 | # -------------------------------------------------------------------------- 6 | set -e 7 | 8 | readonly VERSION="18.1.5" 9 | 10 | version=$(clang-format -version) 11 | 12 | if ! [[ $version == *"$VERSION"* ]]; then 13 | echo "clang-format version check failed." 14 | echo "a version contains '$VERSION' is needed, but get '$version'" 15 | echo "you can install the right version, and make an soft-link to '\$PATH' env" 16 | exit -1 17 | fi 18 | 19 | clang-format $@ 20 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/grid_lstm/tf_model/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | import sys 8 | 9 | sys.path.insert( 10 | 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) 11 | 12 | from .model import (BaseWhileOpGridLSTMNet, FineGrainedOpGridLSTMNet, 13 | WhileOpGridLSTMNet) 14 | 15 | __all__ = [ 16 | "WhileOpGridLSTMNet", 17 | "BaseWhileOpGridLSTMNet", 18 | "FineGrainedOpGridLSTMNet", 19 | ] 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | CUDNN_HOME ?= 7 | BUILD_DIR := build 8 | 9 | .PHONY: build clean 10 | 11 | build: 12 | @mkdir -p $(BUILD_DIR)/ 13 | @cd build && cmake ../ -D PYTHON_EXECUTABLE:FILEPATH=`which python3` \ 14 | -D CUDNN_INCLUDE_DIR=$(CUDNN_HOME)/include \ 15 | -D CUDNN_LIBRARY=$(CUDNN_HOME)/lib/libcudnn.so && make -j$(nproc) 16 | 17 | $(BUILD_DIR)/kaleido: 18 | @$(MAKE) build 19 | 20 | clean: 21 | @rm -f unittest.log 22 | @rm -rf $(BUILD_DIR) 23 | -------------------------------------------------------------------------------- /kaleido/core/operators/elementwise_op.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/tensor.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | namespace ops { 8 | 9 | enum ElementwiseType { 10 | kUnary = 1, 11 | kBinary = 2, 12 | kTernary = 3, 13 | kArityFour = 4, 14 | kAny = -1 // 15 | }; 16 | 17 | template 19 | class ElementwiseOp { 20 | public: 21 | void operator()(const DeviceContext& context, 22 | const std::vector& inputs, Tensor& output, 23 | Functor func); 24 | }; 25 | 26 | } // namespace ops 
27 | } // namespace core 28 | } // namespace kaleido 29 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/README.md: -------------------------------------------------------------------------------- 1 | # Hyper-parameters 2 | 3 | 1. `num_layers` = 8, 8 LSTM layers are stacked 4 | 1. LSTM's `hidden_dim` = `output_dim` = 512 5 | 1. All training samples have a fixed length: `seq_len_` = 100 6 | 1. `batch_size` = 64 7 | 1. `warm_up` = 10, `iteration` = 30 8 | 9 | Explanation for some implementations: 10 | 11 | |Name|Explanation| 12 | |:--|:--| 13 | |Fine-grained Lstm Cell V1|Compute LSTM's Four gates separatedly.| 14 | |Fine-grained Lstm Cell V2|Manually batch GEMMs in LSTM's four gates into a large GEMM.| 15 | |Static LSTM cell in TensorFlow|LSTM cell as a single operator.| 16 | 17 |

18 | 19 |

20 | -------------------------------------------------------------------------------- /benchmarks/fused_two_hgemms/README.md: -------------------------------------------------------------------------------- 1 |

2 |
3 | Fig. Compose back-to-back GEMMs using parallel operator nesting. 4 |

5 | 6 |

7 |
8 | Fig. Extended task dependence graph representations for back-to-back GEMMs. 9 |

10 | 11 |

12 |
13 | Fig. AccessMap annotation attached to the extended task dependence graph. 14 |

15 | 16 |

17 |
18 | Fig. Translate into heirarchical dataflow on the CUDA device. 19 |

20 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/grid_lstm/run.sh: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #!/bin/bash 5 | 6 | # overall for figure 7. 7 | ./build/grid_rnn 32 256 10 1 8 | ./build/grid_rnn 32 512 10 0 9 | ./build/grid_rnn 32 1024 10 0 10 | 11 | depths='1 2 4 8 16 32' 12 | hiddens='256 1024' # for middle size and large size 13 | # scale with depth in Figure 9 14 | for h in $hiddens; do 15 | for d in $depths; do 16 | ./build/grid_rnn $d $h 10 0 17 | done 18 | done 19 | 20 | hiddens='256 1024' # for middle size and large size 21 | # scale with length in Figure 9 22 | lengths='5, 7, 10' 23 | for h in $hiddens; do 24 | for l in $lengths; do 25 | ./build/grid_rnn 32 $h $l 0 26 | done 27 | done 28 | -------------------------------------------------------------------------------- /kaleido/core/tensor_shape.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "kaleido/core/tensor_shape.h" 5 | 6 | namespace kaleido { 7 | namespace core { 8 | 9 | bool TensorShape::IsEuqalShape(const TensorShape& b) const { 10 | if (b.ndim() != ndim()) return false; 11 | for (size_t i = 0; i < ndim(); ++i) { 12 | if (b.dim_size(i) != dim_size(i)) return false; 13 | } 14 | return true; 15 | } 16 | 17 | std::string TensorShape::DebugString() const { 18 | std::stringstream ss; 19 | ss << "shape : ["; 20 | for (size_t i = 0; i < dim_ - 1; ++i) ss << dim_sizes_[i] << ", "; 21 | ss << dim_sizes_[dim_ - 1] << "]"; 22 | return ss.str(); 23 | } 24 | 25 | } // namespace core 26 | } // namespace kaleido 27 | -------------------------------------------------------------------------------- /kaleido/core/place.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/place.h" 5 | 6 | #include 7 | 8 | namespace kaleido { 9 | namespace core { 10 | 11 | class PlacePrinter : public boost::static_visitor<> { 12 | public: 13 | explicit PlacePrinter(std::ostream& os) : os_(os) {} 14 | 15 | void operator()(const CPUPlace&) { os_ << "CPU"; } 16 | void operator()(const CUDAPlace& p) { os_ << "CUDA:" << p.device; } 17 | 18 | private: 19 | std::ostream& os_; 20 | }; 21 | 22 | std::ostream& operator<<(std::ostream& out, const Place& place) { 23 | PlacePrinter printer(out); 24 | boost::apply_visitor(printer, place); 25 | return out; 26 | } 27 | 28 | } // namespace core 29 | } // namespace kaleido 30 | -------------------------------------------------------------------------------- /kaleido/core/config.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if defined(__CUDA_ARCH__) 4 | #define HOST_DEVICE __forceinline__ __host__ __device__ 5 | #define DEVICE __forceinline__ __device__ 6 | #define HOST __forceinline__ __host__ 7 | #else 8 
| #define HOST_DEVICE inline 9 | #define DEVICE inline 10 | #define HOST inline 11 | #endif 12 | 13 | #if defined(__CUDACC_RTC__) 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #define STL_NAMESPACE cuda::std 21 | 22 | #else 23 | #include // ptrdiff_t 24 | #include // uintptr_t 25 | #include // numeric_limits 26 | #include 27 | #include // tuple_size, tuple_element 28 | 29 | #define STL_NAMESPACE std 30 | #endif 31 | -------------------------------------------------------------------------------- /scripts/format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #------------------------------------------------------------------------- 4 | # Copyright (c) Microsoft Corporation. All rights reserved. 5 | # Licensed under the MIT License. 6 | # -------------------------------------------------------------------------- 7 | # Format Python files using yapf 8 | echo "Running yapf..." 9 | find . -type f -name "*.py" \ 10 | ! -path "./build/*" \ 11 | ! -path "./.git/*" \ 12 | ! -path "*.egg-info/*" \ 13 | -print0 | xargs -0 yapf --in-place 14 | 15 | # Format Python imports using isort 16 | echo "Running isort..." 17 | isort . 
18 | 19 | 20 | 21 | find kaleido/ -name "*.cc" -o -name "*.cpp" -o -name "*.h" -o -name "*.cu" | xargs clang-format -i 22 | find benchmarks/ -name "*.cpp" -o -name "*.h" -o -name "*.cu" | xargs clang-format -i 23 | -------------------------------------------------------------------------------- /docs/fractaltensor_operations/information_query.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [Information query](#information-query) 4 | - [length](#length) 5 | - [depth](#depth) 6 | 7 | 12 | 13 | 14 | 15 | ## Information query 16 | 17 | ### length 18 | 19 | $$\mathbf{length} ::\Psi n.[\alpha]^d_n \rightarrow \text{int}$$ 20 | 21 | ```python 22 | length(x: FractalTensor[T]) -> List[int] 23 | ``` 24 | 25 | `length` is only available after data is feed to a `FractalTensor` variable, otherwise, return an empty list. 26 | 27 | ### depth 28 | 29 | $$\mathbf{depth} ::\Psi n.[\alpha]^d_n \rightarrow \text{int}$$ 30 | 31 | ```python 32 | depth(x: FractalTensor[T]) -> int 33 | ``` 34 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/cute_stacked_lstm/README.md: -------------------------------------------------------------------------------- 1 |

2 |
3 | Fig. The ETDG representation for stacked LSTM. 4 |

5 | 6 |

7 |
8 | Fig. The original edge annotation for S0 and S1 before program transformation. 9 |

10 | 11 |

12 |
13 | Fig. The original edge annotation for S2 and S3 before program transformation. 14 |

15 | 16 |

17 |
18 | Fig. The edge annotation for S0 and S1 after program transformation. 19 |

20 | 21 |

22 |
23 | Fig. The egde annotation for S2 and S3 after program transformation. 24 | -------------------------------------------------------------------------------- /kaleido/core/device/traits_base.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace kaleido { 5 | namespace core { 6 | 7 | // FIXME(ying). The swizzle function requires a data tile with a minimal 8 | // shape of <8, 32> for the <2, 3, 3> case, and a minimal shape of <8, 64> for 9 | // the <3, 3, 3> case. Here requires some check to ensure that the data tile 10 | // meets these requirements before using this function. 11 | template 12 | static constexpr int kSwizzle = (N == 32 ? 2 : 3); 13 | 14 | template 15 | struct TraitsBase { 16 | static constexpr int kAccessInBits = 128; // 128 bits 17 | static constexpr int kElmentBits = cutlass::sizeof_bits::value; 18 | static constexpr int kNumPerAccess = kAccessInBits / kElmentBits; 19 | }; 20 | 21 | } // namespace core 22 | } // namespace kaleido 23 | -------------------------------------------------------------------------------- /kaleido/core/tile_shape.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/config.h" 4 | 5 | #include 6 | #include 7 | 8 | namespace kaleido { 9 | namespace core { 10 | 11 | template 12 | using TileShape = cute::tuple...>; 13 | 14 | // FIXME(ying): Be careful that names like `rank` is quite common. 15 | // It is easy to conflict with cute's builtin function. 
16 | template 17 | __device__ static constexpr size_t rank = cute::rank_v; 18 | 19 | template 20 | __device__ static constexpr size_t dim_size = cute::get(TileShape{}); 21 | 22 | template 23 | __device__ static constexpr int64_t get_numel = cute::size(TileShape{}); 24 | 25 | } // namespace core 26 | } // namespace kaleido 27 | -------------------------------------------------------------------------------- /cmake/python.cmake: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | # FIXME(Ying): This may lead to runtime error if users have multiple locally 7 | # installed Pythons. To avoid runtime error, it is better to explicitly specify 8 | # which python is in use through: cmake -DPYTHON_EXECUTABLE:FILEPATH=`which 9 | # python3` 10 | 11 | find_package(PythonLibs REQUIRED) 12 | 13 | message(STATUS "Python include dir: ${PYTHON_INCLUDE_DIR}") 14 | message(STATUS "Python library: ${PYTHON_LIBRARY}") 15 | 16 | add_library(python SHARED IMPORTED GLOBAL) 17 | set_property(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) 18 | include_directories(${PYTHON_INCLUDE_DIRS}) 19 | -------------------------------------------------------------------------------- /kaleido/core/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | set(TEST_ROOT ${PROJECT_SOURCE_DIR}/kaleido/core/tests) 7 | 8 | cc_test_build(test_allocator SRCS ${TEST_ROOT}/test_allocator.cc DEPS 9 | fractaltensor_core) 10 | cc_test_build( 11 | test_tensor_and_fractaltensor 12 | SRCS 13 | ${TEST_ROOT}/test_tensor_and_fractaltensor.cc 14 | DEPS 15 | fractaltensor_core 16 | print_op 17 | fill_op) 18 | 19 | cc_test_build(test_cuda_info SRCS ${TEST_ROOT}/test_cuda_info.cc DEPS 20 | fractaltensor_core) 21 | 22 | cc_test_build(test_layout SRCS ${TEST_ROOT}/test_layout.cc DEPS 23 | fractaltensor_core) 24 | -------------------------------------------------------------------------------- /benchmarks/rnn/tvm/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import logging 7 | 8 | 9 | def get_logger(log_file_name='tvm_codegen.txt', log_level=logging.DEBUG): 10 | logger = logging.getLogger() 11 | logger.setLevel(log_level) 12 | formatter = logging.Formatter( 13 | '%(asctime)s - %(name)s - %(levelname)s: - %(message)s', 14 | datefmt='%Y-%m-%d %H:%M:%S') 15 | 16 | fh = logging.FileHandler(log_file_name) 17 | fh.setLevel(log_level) 18 | fh.setFormatter(formatter) 19 | 20 | ch = logging.StreamHandler() 21 | ch.setLevel(log_level) 22 | ch.setFormatter(formatter) 23 | 24 | logger.addHandler(ch) 25 | logger.addHandler(fh) 26 | return logger 27 | -------------------------------------------------------------------------------- /kaleido/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. 
All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | import sys 9 | 10 | from kaleido.frontend import operations 11 | from kaleido.frontend.fractal_tensor import * 12 | from kaleido.frontend.tensor import Parameter, Tensor 13 | from kaleido.frontend.types import * 14 | from kaleido.parser import * 15 | 16 | del absolute_import 17 | del division 18 | del print_function 19 | 20 | # FIXME(Ying): please manually create the soft link of the build dynamic library 21 | # in the build directory. 22 | # It needs a standarded way to distribute the package and import 23 | # bindings in future. 24 | # import _core 25 | 26 | # _core.init_glog(sys.argv[0]) 27 | -------------------------------------------------------------------------------- /kaleido/core/device/cuda_timer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/device/cuda_utils.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | class CudaTimer { 8 | private: 9 | cudaEvent_t start; 10 | cudaEvent_t stop; 11 | cudaStream_t stream; 12 | 13 | public: 14 | CudaTimer() { 15 | CudaCheck(cudaEventCreate(&start)); 16 | CudaCheck(cudaEventCreate(&stop)); 17 | } 18 | 19 | ~CudaTimer() { 20 | CudaCheck(cudaEventDestroy(start)); 21 | CudaCheck(cudaEventDestroy(stop)); 22 | } 23 | 24 | void Start(cudaStream_t st = 0) { 25 | stream = st; 26 | CudaCheck(cudaEventRecord(start, stream)); 27 | } 28 | 29 | float Stop() { 30 | float milliseconds = 0.; 31 | CudaCheck(cudaEventRecord(stop, stream)); 32 | CudaCheck(cudaEventSynchronize(stop)); 33 | CudaCheck(cudaEventElapsedTime(&milliseconds, start, stop)); 34 | return milliseconds; 35 | } 36 | }; 37 | } // namespace core 38 | } // namespace kaleido 39 | -------------------------------------------------------------------------------- 
/examples/convolution/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import random 7 | from typing import NamedTuple, Tuple 8 | 9 | import torch 10 | 11 | import kaleido 12 | from kaleido import FractalTensor, FractalTensorStorage, Tensor, TensorStorage 13 | from kaleido import operations as ops 14 | 15 | 16 | def gen_image_batch(tensor_shape: Tuple[int], 17 | batch_size: int, 18 | device='cpu') -> FractalTensor[FractalTensor[Tensor]]: 19 | """Returns a batch of image in format of NCHW.""" 20 | x = FractalTensor( 21 | TensorStorage(tensor_shape, kaleido.float32, device=device)) 22 | x.indices = list(range(batch_size)) 23 | x.initialize(torch.rand, *x.flatten_shape, device=device) 24 | return x 25 | -------------------------------------------------------------------------------- /benchmarks/rnn/tvm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | project(tvm_test CXX C) 3 | 4 | set(CMAKE_CXX_STANDARD 14) 5 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") 6 | set(CMAKE_CXX_FLAGS_DEBUG 7 | "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") 8 | set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall -Wno-sign-compare") 9 | 10 | find_package(CUDA QUIET REQUIRED) 11 | include_directories(${CUDA_INCLUDE_DIRS}) 12 | 13 | set(TVM_ROOT 14 | "${CMAKE_CURRENT_SOURCE_DIR}/../../../build/third_party/tvm/src/extern_tvm/" 15 | ) 16 | 17 | include_directories(${TVM_ROOT}/include ${TVM_ROOT}/3rdparty/dmlc-core/include 18 | ${TVM_ROOT}/3rdparty/dlpack/include) 19 | link_directories(${TVM_ROOT}) 20 | 21 | add_executable(main main.cc) 22 | 
target_link_libraries( 23 | main 24 | tvm_runtime 25 | cuda 26 | ${CUDA_CUDART_LIBRARY} 27 | ${CUDA_LIBRARIES} 28 | ${CUDA_CUBLAS_LIBRARIES} 29 | ${CUDA_curand_LIBRARY}) 30 | -------------------------------------------------------------------------------- /kaleido/parser/errors.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | __all__ = [ 7 | 'ParseError', 8 | 'UnsupportedConstruct', 9 | 'UnsupportedType', 10 | 'AnnotationError', 11 | 'UnknownPrimitiveOps', 12 | 'ShapeError', 13 | ] 14 | 15 | 16 | class ParseError(Exception): 17 | pass 18 | 19 | 20 | class UnsupportedConstruct(ParseError): 21 | """Exception for unsupported Python construct.""" 22 | 23 | def __init__(self, msg=None): 24 | self.msg = f"Unspport Python construct {msg}." 25 | 26 | 27 | class UnsupportedType(ParseError): 28 | pass 29 | 30 | 31 | class AnnotationError(ParseError): 32 | pass 33 | 34 | 35 | class UnknownPrimitiveOps(ParseError): 36 | pass 37 | 38 | 39 | class ShapeError(ParseError): 40 | pass 41 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_dilated_rnn/README.md: -------------------------------------------------------------------------------- 1 | # Stacked Dilated LSTM 2 | 3 | ## PyTorch implementation 4 | 5 |

6 | 7 |

8 | 9 | ## Hyper-parameters 10 | 11 | 1. `batch_size` = 32 12 | 2. `seq_len` = 100 13 | 3. `layers_num`= 6, the corresponding `dilation`=`[1, 2, 4, 8, 16, 32]` 14 | 4. `input_size`= 64, while size means number of dims 15 | 5. `hidden_size` = `output_size` = 64 16 | 6. `rnn_cell` = `LSTM` 17 | 18 | ## Results 19 | 20 | `counting_iteration_num` = 50, `warmup_iteration_num` = 20 21 | 22 | |Test Name|Average Time(s)|Elapsed Time(s)|Throughput(seq/s)| 23 | |:--|:--|:--|:--| 24 | |PyTroch Imperative|0.0085 |0.1698 |3768.5783| 25 | |PyTorch_JITed|0.0059 |0.1176 |5443.7788| 26 | |PyTorch Pad per Layer (cannot be JITed)|0.0092 |0.1843 |3472.5731| 27 | |TensorFlow Eager|0.0656 |3.2781 |488.0861| 28 | |TensorFlow Auto-graph|0.0073 |0.3648 |4386.2863| 29 | |TensorFlow Graph-mode|0.0051 |0.2575 |6214.1577| 30 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/README.md: -------------------------------------------------------------------------------- 1 | # Methods 2 | 3 | Below is a small test case to illustrate the idea: 4 | 5 | - batch_size = 2 6 | - length = 7 7 | - depth = 4 8 | - `ysss` is laid out in `[depth, length, batch_size]` 9 | 10 |

11 | 12 |

13 | 14 | In this small example, it is easy to observe that, within a hyperplane, parallel iterations that access `ysss`, `xs` and `hs` **follow a fixed stride**, thus `xs @ W` and `hs @ U` can be translated into `stridedBMM`. 15 | 16 | |No.|ysss|xs|hs| 17 | |:-:|:-:|:-:|:-:| 18 | |3-0|[16]|[2]|[14]| 19 | |3-1|[18,30]|[4,16]|[16,28]| 20 | |3-2|[20,32,44]|[6,18,30]|[18,30,42]| 21 | |3-3|[22,32,44]|[8,20,32]|[20,32,44]| 22 | |3-4|[24,36,48]|[10,22,34]|[22,34,46]| 23 | |3-5|[26,38,50]|[12,24,36]|[24,36,48]| 24 | |3-6|[40,52]|[26,38]|[38,50]| 25 | |3-7|[54]|[40]|[52]| 26 | 27 |

28 |
29 | Fig. The dataflow graph representation for the stacked LSTM network. 30 |

31 | -------------------------------------------------------------------------------- /kaleido/core/device/gpu_context.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/device/gpu_context.h" 5 | 6 | #include "kaleido/core/device/cuda_info.h" 7 | 8 | namespace kaleido { 9 | namespace core { 10 | 11 | GPUContext::GPUContext() { 12 | CublasCheck(cublasCreate(&cublas_handle_)); 13 | CublasCheck(cublasSetPointerMode(cublas_handle_, CUBLAS_POINTER_MODE_HOST)); 14 | // CudnnCheck(cudnnCreate(&cudnn_handle_)); 15 | 16 | compute_capability_ = GetGPUComputeCapability(0); 17 | multi_process_ = GetGPUMultiProcessors(0); 18 | max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(0); 19 | max_threads_per_block_ = GetGPUMaxThreadsPerBlock(0); 20 | max_grid_dim_size_ = GetGpuMaxGridDimSize(0); 21 | 22 | device_name_ = GetDeviceName(); 23 | } 24 | 25 | GPUContext::~GPUContext() { 26 | CublasCheck(cublasDestroy(cublas_handle_)); 27 | // CudnnCheck(cudnnDestroy(cudnn_handle_)); 28 | } 29 | 30 | } // namespace core 31 | } // namespace kaleido 32 | -------------------------------------------------------------------------------- /kaleido/core/tests/test_cuda_info.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "kaleido/core/device/cuda_info.h" 5 | 6 | #include 7 | 8 | #include 9 | 10 | namespace kaleido { 11 | namespace core { 12 | 13 | TEST(test, TEST_GET_CUDA_DEVICE_INFO) { 14 | std::cout << "cuda device count: " << GetGPUDeviceCount() << std::endl; 15 | std::cout << "Compute Capability: " << GetGPUComputeCapability(0) 16 | << std::endl; 17 | std::cout << "Multiprocessors: " << GetGPUMultiProcessors(0) << std::endl; 18 | std::cout << "Max threads per MP: " << GetGPUMaxThreadsPerMultiProcessor(0) 19 | << std::endl; 20 | std::cout << "Max threads per blocks: " << GetGPUMaxThreadsPerBlock(0) 21 | << std::endl; 22 | auto grid_size = GetGpuMaxGridDimSize(0); 23 | std::cout << "Max grid size (x, y, z): " << grid_size.x << ", " << grid_size.y 24 | << ", " << grid_size.z << std::endl; 25 | } 26 | 27 | } // namespace core 28 | } // namespace kaleido 29 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/grid_lstm/run_grid_lstm_pt.sh: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #!/bin/bash 5 | 6 | seq_len=10 7 | batch_size=32 8 | 9 | # overall 10 | hiddens='256 512 1024' 11 | for hidden in $hiddens; do 12 | python3 gridlstm_pt.py --seq_len=$seq_len \ 13 | --batch_size=$batch_size \ 14 | --hidden_size=$hidden \ 15 | --depth=32 16 | done 17 | 18 | # scale with depth 19 | depths='1 2 4 8 16 32' 20 | hiddens='256 1024' 21 | for hidden in $hiddens; do 22 | for depth in $depths; do 23 | python3 gridlstm_pt.py --seq_len=$seq_len \ 24 | --batch_size=$batch_size \ 25 | --hidden_size=$hidden \ 26 | --depth=$depth 27 | done 28 | done 29 | 30 | # scale with length 31 | lengths='5 7 10' 32 | hiddens='256 1024' 33 | for length in $lengths; do 34 | for hidden in $hiddens; do 35 | python3 gridlstm_pt.py --seq_len=$seq_len \ 36 | --batch_size=32 \ 37 | --hidden_size=$hidden \ 38 | --depth=32 39 | done 40 | done 41 | -------------------------------------------------------------------------------- /docs/fractaltensor_operations/access_primitives.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [Access primitives of FractalTensor](#access-primitives-of-FractalTensor) 4 | - [\*index](#index) 5 | - [\*slice](#slice) 6 | - [\*gather (permute elements)](#gather-permute-elements) 7 | 8 | 12 | 13 | 14 | 15 | # Access primitives of FractalTensor 16 | 17 | Primitive access operations have first-class implementations in the backend. 18 | 19 | ## \*index 20 | 21 | $$\mathbf{index} ::\Psi n.[\alpha]^d_n \rightarrow \Psi m.[\alpha]^{d-1}_m$$ 22 | 23 | ```python 24 | index(x: FractalTensor[T], i: int) -> T 25 | ``` 26 | 27 | Access a `FractalTensor` variable using the `[]` operator is equivalent to call `index` . 
28 | 29 | ## \*slice 30 | 31 | ```python 32 | slice(x: FractalTensor[T], start: int, end: int, stride: int) -> FractalTensor[T] 33 | ``` 34 | 35 | ## \*gather (permute elements) 36 | 37 | ```python 38 | gather(x: FractalTensor[T], indices:Tuple[int]) -> FractalTensor[T] 39 | ``` 40 | -------------------------------------------------------------------------------- /kaleido/core/device/kernels/tile_transmitter.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace kaleido { 21 | namespace core { 22 | namespace cuda_kernel { 23 | 24 | enum class TileLayout { 25 | RowMajor = 0, 26 | ColumnMajor = 1, 27 | SwizzledRowMajor = 2, // shared memory layout 28 | SwizzledColumnMajor = 3 29 | }; 30 | 31 | } // namespace cuda_kernel 32 | } // namespace core 33 | } // namespace kaleido 34 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | from kaleido.frontend.operations.conversion import * 9 | from kaleido.frontend.operations.fractaltensor.access import * 10 | from kaleido.frontend.operations.fractaltensor.functional.aggregate import * 11 | from kaleido.frontend.operations.fractaltensor.functional.apply_to_each import * 12 | from kaleido.frontend.operations.tensor.arithmetic.broadcast import * 13 | from kaleido.frontend.operations.tensor.arithmetic.contraction import * 14 | from kaleido.frontend.operations.tensor.arithmetic.elementwise import * 15 | from kaleido.frontend.operations.tensor.arithmetic.reduction import * 16 | from kaleido.frontend.operations.tensor.constants import * 17 | from kaleido.frontend.operations.tensor.data_movements import * 18 | from kaleido.frontend.operations.tensor.reshape import * 19 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/figures/stacked_lstm_results.tsv: -------------------------------------------------------------------------------- 1 | Test Name Average Time (s) Elapsed Time(s) Throughput(seq per sec) 2 | pt_cudnn_lstm 0.0251 0.7545 2544.7697 3 | pt_finegrained_op_v1_cuda:0 0.4980 14.9410 128.5051 4 | pt_finegrained_op_v2_cuda:0 0.2123 6.3702 301.4025 5 | pt_finegrained_op_v1_cuda:0_JIT 0.1387 4.1602 461.5115 6 | pt_finegrained_op_v2_cuda:0_JIT 0.0505 1.5161 1266.4127 7 | tf_graph_cudnnlstm 0.0374 1.1219 1711.4016 8 | tf_graph_fine_grained_op_lstm_v1_gpu 0.1267 3.8025 504.9329 9 | tf_graph_fine_grained_op_lstm_v2_gpu 0.0743 2.2293 861.2472 10 | tf_graph_static_lstm_cell_gpu 0.0823 2.4683 777.8715 11 | tf_graph_whileOpLstm_gpu 0.1068 3.2042 599.2203 12 | tf_eager_cudnnlstm 0.0751 2.2541 851.7819 13 | tf_eager_fine_grained_op_lstm_v1_gpu 4.5640 136.9190 14.0229 14 | tf_eager_fine_grained_op_lstm_v2_gpu 3.2110 96.3287 19.9318 15 
| tf_eager_static_lstm_cell_gpu 2.3811 71.4322 26.8786 16 | tf_autograph_cudnnlstm 0.0328 0.9851 1949.1312 17 | tf_autograph_fine_grained_op_lstm_v1_gpu 0.0857 2.5710 746.7896 18 | tf_autograph_fine_grained_op_lstm_v2_gpu 0.0501 1.5038 1276.7482 19 | tf_autograph_static_lstm_cell_gpu 0.0503 1.5080 1273.2241 20 | -------------------------------------------------------------------------------- /kaleido/core/tests/test_layout.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/cuda_allocator.h" 5 | #include "kaleido/core/layout.h" 6 | 7 | #include 8 | 9 | #include 10 | 11 | namespace kaleido { 12 | namespace core { 13 | 14 | TEST(test, TestLayout) { 15 | const int kRow = 3; 16 | const int kCol = 7; 17 | using L1 = RowMajor; 18 | L1 row_major; 19 | 20 | std::cout << "num_rows: " << num_rows << std::endl 21 | << "num_cols: " << num_cols; 22 | 23 | for (int row_id = 0; row_id < num_rows; ++row_id) { 24 | for (int col_id = 0; col_id < num_cols; ++col_id) { 25 | EXPECT_EQ(row_major(row_id, col_id), row_id * kCol + col_id); 26 | } 27 | } 28 | 29 | using L2 = ColMajor; 30 | L2 col_major; 31 | for (int row_id = 0; row_id < num_rows; ++row_id) { 32 | for (int col_id = 0; col_id < num_rows; ++col_id) { 33 | EXPECT_EQ(col_major(row_id, col_id), row_id + col_id * kRow); 34 | } 35 | } 36 | } 37 | 38 | } // namespace core 39 | } // namespace kaleido 40 | -------------------------------------------------------------------------------- /kaleido/core/device/kernels/lstm/dilated_lstm/region1.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #pragma once 5 | 6 | #include "kaleido/core/device/kernels/lstm.h" 7 | #include "kaleido/core/tensor.h" 8 | 9 | namespace kaleido::core::cuda_kernel { 10 | 11 | template 13 | float DilatedLstmRegion1(Element* csss, Element* hsss, const Element* xss, 14 | const Element* ws, const Element* us, 15 | const Element* init, int seq_length) { 16 | float elapsed_time = 0.0; 17 | 18 | const Element* x = xss; 19 | const Element* w = ws; 20 | const Element* u = us; 21 | 22 | cuda_kernel::CuteLSTMLayer 24 | cute_lstm_layer; 25 | 26 | elapsed_time += cute_lstm_layer(w, x, u, init, init, csss, hsss, seq_length); 27 | 28 | return elapsed_time; 29 | } 30 | } // namespace kaleido::core::cuda_kernel 31 | -------------------------------------------------------------------------------- /kaleido/core/fractal_tensor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "kaleido/core/allocator.h" 3 | #include "kaleido/core/types.h" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | 8 | class FractalTensor { 9 | public: 10 | FractalTensor(const FractalTensorTypeDesc& desc, 11 | std::shared_ptr alloc) 12 | : type_desc_(desc), alloc_(alloc), data_(nullptr) { 13 | long byteCount = type_desc_.GetNumBytes(); 14 | if (byteCount) data_ = alloc_->Allocate(byteCount); 15 | }; 16 | 17 | ~FractalTensor() = default; 18 | 19 | std::string DebugString() const; 20 | 21 | template 22 | const T* data() const { 23 | return reinterpret_cast(data_); 24 | } 25 | 26 | template 27 | T* mutable_data() { 28 | return reinterpret_cast(data_); 29 | } 30 | 31 | private: 32 | FractalTensorTypeDesc type_desc_; 33 | std::shared_ptr alloc_; 34 | 35 | void* data_; 36 | }; 37 | static inline std::ostream& operator<<(std::ostream& os, 38 | const FractalTensor& ft) { 39 | os << ft.DebugString(); 40 | return os; 41 | } 42 | 43 | } // namespace core 44 | } // namespace kaleido 45 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /cmake/external/pybind.cmake: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | include(ExternalProject) 7 | 8 | set(PYBIND_PREFIX_DIR ${THIRD_PARTY_PATH}/pybind) 9 | set(PYBIND_SOURCE_DIR ${PYBIND_PREFIX_DIR}/src/extern_pybind) 10 | set(PYBIND_REPOSITORY https://github.com/pybind/pybind11.git) 11 | set(PYBIND_TAG v2.11.1) 12 | 13 | cache_third_party( 14 | extern_pybind 15 | REPOSITORY 16 | ${PYBIND_REPOSITORY} 17 | TAG 18 | ${PYBIND_TAG} 19 | DIR 20 | PYBIND_SOURCE_DIR) 21 | 22 | set(PYBIND_INCLUDE_DIR ${PYBIND_SOURCE_DIR}/include) 23 | include_directories(${PYBIND_INCLUDE_DIR}) 24 | 25 | ExternalProject_Add( 26 | extern_pybind 27 | ${EXTERNAL_PROJECT_LOG_ARGS} 28 | ${SHALLOW_CLONE} 29 | "${PYBIND_DOWNLOAD_CMD}" 30 | PREFIX ${PYBIND_PREFIX_DIR} 31 | SOURCE_DIR ${PYBIND_SOURCE_DIR} 32 | UPDATE_COMMAND "" 33 | CONFIGURE_COMMAND "" 34 | BUILD_COMMAND "" 35 | INSTALL_COMMAND "" 36 | TEST_COMMAND "") 37 | 38 | add_library(pybind INTERFACE) 39 | add_dependencies(pybind extern_pybind) 40 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/README.md: -------------------------------------------------------------------------------- 1 | # Test Environment 2 | 3 | ``` {.text} 4 | OS: Ubuntu 16.04.7 LTS 5 | TensorFlow version: 2.2.3, compiled by gcc 5.0 6 | PyTorch v1.9.0 7 | CUDA Version 10.2 8 | CUDNN Version 7.6.5 9 | ``` 10 | ## CPU information 11 | 12 | ```bash 13 | lscpu 14 | ``` 15 | 16 | ``` {.text} 17 | Architecture: x86_64 18 | CPU op-mode(s): 32-bit, 64-bit 19 | Byte Order: Little Endian 20 | CPU(s): 12 # virtual CPU 21 | On-line CPU(s) list: 0-11 22 | Thread(s) per core: 2 23 | Core(s) per socket: 6 24 | Socket(s): 1 25 | NUMA node(s): 1 26 | Vendor ID: GenuineIntel 27 | CPU family: 6 28 | Model: 63 29 | Model name: Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz 30 | Stepping: 2 31 | CPU MHz: 1200.117 32 | CPU max MHz: 3700.0000 33 | CPU min MHz: 1200.0000 34 | BogoMIPS: 7000.36 35 | Virtualization: VT-x 
36 | L1d cache: 32K 37 | L1i cache: 32K 38 | L2 cache: 256K 39 | L3 cache: 15360K 40 | NUMA node0 CPU(s): 0-11 41 | ``` 42 | 43 | ### GPU information 44 | 45 | GeForce RTX 2080 Ti, Compute Capability 7.5 46 | -------------------------------------------------------------------------------- /examples/hello_world/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from typing import List, Tuple 7 | 8 | import torch 9 | 10 | import kaleido 11 | from kaleido import FractalTensor, FractalTensorStorage, Tensor, TensorStorage 12 | from kaleido import operations as ops 13 | 14 | seq_len = 10 15 | batch_size = 7 16 | 17 | hidden_dim = 512 18 | depth = 4 19 | 20 | device = 'cpu' 21 | 22 | 23 | def create_params(shape, depth): 24 | x = FractalTensor(TensorStorage(shape, kaleido.float32, device=device)) 25 | x.indices = list(range(depth)) 26 | x.initialize(torch.rand, *x.flatten_shape, device=device) 27 | return x 28 | 29 | 30 | xss = FractalTensor( 31 | FractalTensorStorage( 32 | TensorStorage((1, hidden_dim), kaleido.float32, device=device))) 33 | xss.indices = [list(range(seq_len)) for _ in range(batch_size)] 34 | xss.initialize(torch.rand, *xss.flatten_shape, device=device) 35 | 36 | Ws = create_params([hidden_dim, hidden_dim], depth) 37 | Us = create_params([hidden_dim, hidden_dim], depth) 38 | -------------------------------------------------------------------------------- /cmake/external/tvm.cmake: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | include(ExternalProject) 7 | 8 | set(TVM_PREFIX_DIR ${THIRD_PARTY_PATH}/tvm) 9 | set(TVM_SOURCE_DIR ${TVM_PREFIX_DIR}/src/extern_tvm) 10 | 11 | set(TVM_REPOSITORY https://github.com/apache/tvm.git) 12 | set(TVM_TAG v0.8.0) 13 | 14 | cache_third_party( 15 | extern_tvm 16 | REPOSITORY 17 | ${TVM_REPOSITORY} 18 | TAG 19 | ${TVM_TAG} 20 | DIR 21 | TVM_SOURCE_DIR) 22 | 23 | set(TVM_INCLUDE_DIR ${TVM_SOURCE_DIR}/include) 24 | include_directories(${TVM_INCLUDE_DIR}) 25 | 26 | ExternalProject_Add( 27 | extern_tvm 28 | ${EXTERNAL_PROJECT_LOG_ARGS} 29 | ${SHALLOW_CLONE} 30 | "${TVM_DOWNLOAD_CMD}" 31 | PREFIX ${TVM_PREFIX_DIR} 32 | SOURCE_DIR ${TVM_SOURCE_DIR} 33 | BUILD_IN_SOURCE 1 34 | COMMAND "cp ${TVM_SOURCE_DIR}/cmake/config.cmake ${TVM_SOURCE_DIR}" 35 | UPDATE_COMMAND "" 36 | CONFIGURE_COMMAND ${CMAKE_COMMAND} -DCMAKE_POSITION_INDEPENDENT_CODE=ON . 37 | BUILD_COMMAND $(MAKE) -j$(nproc) 38 | INSTALL_COMMAND "" 39 | TEST_COMMAND "") 40 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/grid_lstm/test_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import tensorflow as tf 7 | 8 | 9 | def get_config(): 10 | config = tf.compat.v1.ConfigProto( 11 | gpu_options=tf.compat.v1.GPUOptions( 12 | allow_growth=True, per_process_gpu_memory_fraction=0.2)) 13 | 14 | config.log_device_placement = False 15 | config.allow_soft_placement = True 16 | 17 | config.intra_op_parallelism_threads = 0 18 | config.inter_op_parallelism_threads = 56 19 | 20 | return config 21 | 22 | 23 | def device(dtype="cpu"): 24 | """Return the TF device string. 
25 | 26 | Args: 27 | dtype: String, "cpu" or "gpu". 28 | 29 | Raises: 30 | ValueError: if dtype is an unknown device. 31 | """ 32 | if dtype == "cpu": 33 | return "/device:CPU:0" 34 | elif dtype == "gpu": 35 | assert tf.test.is_gpu_available(cuda_only=True) 36 | return "/device:GPU:0" 37 | else: 38 | raise ValueError("Unknown device type. Should be cpu or gpu.") 39 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/test_product.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | from context import * 9 | 10 | 11 | class TestProduct(unittest.TestCase): 12 | def setUp(self): 13 | random.seed(12345) 14 | 15 | data1 = list(range(11)) 16 | self.x = kaleido.FractalTensor.from_pylist(data1) 17 | 18 | data2 = list(range(7)) 19 | self.y = kaleido.FractalTensor.from_pylist(data2) 20 | 21 | def test_product(self): 22 | xss, yss = kaleido.operations.product(self.x, self.y) 23 | self.assertTrue(isinstance(xss, kaleido.FractalTensor)) 24 | self.assertTrue(isinstance(yss, kaleido.FractalTensor)) 25 | 26 | for i, (xs, ys) in enumerate(kaleido.operations.zip(xss, yss)): 27 | for j, (x, y) in enumerate(kaleido.operations.zip(xs, ys)): 28 | self.assertEqual(x.data.item(), j) 29 | self.assertEqual(y.data.item(), i) 30 | 31 | 32 | if __name__ == '__main__': 33 | unittest.main() 34 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/test_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | 
# Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import tensorflow as tf 7 | 8 | 9 | def get_config(): 10 | config = tf.compat.v1.ConfigProto( 11 | gpu_options=tf.compat.v1.GPUOptions( 12 | allow_growth=True, per_process_gpu_memory_fraction=0.2)) 13 | 14 | config.log_device_placement = False 15 | config.allow_soft_placement = True 16 | 17 | config.intra_op_parallelism_threads = 0 18 | config.inter_op_parallelism_threads = 56 19 | 20 | return config 21 | 22 | 23 | def device(dtype='cpu'): 24 | '''Return the TF device string. 25 | 26 | Args: 27 | dtype: String, 'cpu' or 'gpu'. 28 | 29 | Raises: 30 | ValueError: if dtype is an unknown device. 31 | ''' 32 | 33 | if dtype == 'cpu': 34 | return '/device:CPU:0' 35 | elif dtype == 'gpu': 36 | assert len(tf.config.list_physical_devices('GPU')) 37 | return '/device:GPU:0' 38 | else: 39 | raise ValueError('Unknown device type. Should be cpu or gpu.') 40 | -------------------------------------------------------------------------------- /cmake/external/cccl.cmake: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | include(ExternalProject) 7 | 8 | set(CCCL_PREFIX_DIR ${THIRD_PARTY_PATH}/cccl) 9 | set(CCCL_SOURCE_DIR ${CCCL_PREFIX_DIR}/src/extern_cccl) 10 | set(CCCL_REPOSITORY https://github.com/NVIDIA/cccl.git) 11 | set(CCCL_TAG v2.3.0-rc0) 12 | 13 | cache_third_party( 14 | extern_cccl 15 | REPOSITORY 16 | ${CCCL_REPOSITORY} 17 | TAG 18 | ${CCCL_TAG} 19 | DIR 20 | CCCL_SOURCE_DIR) 21 | 22 | set(CUB_INCLUDE_DIR ${CCCL_SOURCE_DIR}/cub) 23 | set(THRUST_INCLUDE_DIR ${CCCL_SOURCE_DIR}/thrust) 24 | set(LIBCUDACXX_INCLUDE_DIR ${CCCL_SOURCE_DIR}/libcudacxx) 25 | 26 | ExternalProject_Add( 27 | extern_cccl 28 | ${EXTERNAL_PROJECT_LOG_ARGS} 29 | ${SHALLOW_CLONE} 30 | "${CCCL_DOWNLOAD_CMD}" 31 | PREFIX ${CCCL_PREFIX_DIR} 32 | SOURCE_DIR ${CCCL_SOURCE_DIR} 33 | UPDATE_COMMAND "" 34 | CONFIGURE_COMMAND "" 35 | BUILD_COMMAND "" 36 | INSTALL_COMMAND "" 37 | TEST_COMMAND "") 38 | 39 | include_directories(${CUB_INCLUDE_DIR}) 40 | include_directories(${THRUST_INCLUDE_DIR}) 41 | include_directories(${LIBCUDACXX_INCLUDE_DIR}) 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 
8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /examples/flash_attention/flash_attention_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import torch 7 | 8 | import kaleido 9 | from kaleido import FractalTensor, FractalTensorStorage, TensorStorage 10 | 11 | 12 | def create_input(head_dim: int, 13 | num_heads: int, 14 | seq_len: int, 15 | batch_size: int, 16 | block_dim: int, 17 | device: str = 'cpu'): 18 | # depth-1: batch_size 19 | # depth-2: num_heads 20 | # depth-3: block_num = seq_length / block_dim 21 | xsss = FractalTensor( 22 | FractalTensorStorage( 23 | FractalTensorStorage( 24 | TensorStorage((block_dim, head_dim), 25 | kaleido.float32, 26 | device=device)))) 27 | indices = [] 28 | block_num = int(seq_len / block_dim) 29 | for _ in range(batch_size): 30 | indices.append([list(range(block_num)) for _ in range(num_heads)]) 31 | xsss.indices = indices 32 | xsss.initialize(torch.rand, *xsss.flatten_shape, device=device) 33 | return xsss 34 | -------------------------------------------------------------------------------- /examples/sparse_attention/torch_windowed_attention_demo.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | 9 | 10 | def test(qss, kss, vss): 11 | # Q, K, V: [batch_size, block_num, block_size, hidden] 12 | Q = qss[:, :, 2:-2:, :] 13 | K = torch.cat((kss[:, :, 1:-3, :], kss[:, :, 2:-2, :], kss[:, :, 3:-1, :]), 14 | 2) 15 | V = torch.cat((vss[:, :, 1:-3, :], vss[:, :, 2:-2, :], vss[:, :, 3:-1, :]), 16 | 2) 17 | QK = torch.einsum("blqd,blkd->blqk", Q, K) 18 | attn_weights = F.softmax(QK, -1) 19 | attn_vecs = torch.einsum("blqk,blkd->blqd", attn_weights, V) 20 | return attn_vecs 21 | 22 | 23 | if __name__ == '__main__': 24 | batch_size = 16 25 | block_size = 16 26 | seq_len = 512 27 | hidden_dim = 128 28 | block_num = seq_len // block_size 29 | 30 | queries = torch.rand(batch_size, block_num, block_size, hidden_dim) 31 | keys = torch.rand(batch_size, block_num, block_size, hidden_dim) 32 | values = torch.rand(batch_size, block_num, block_size, hidden_dim) 33 | 34 | test(queries, keys, values) 35 | -------------------------------------------------------------------------------- /benchmarks/rnn/cuDNN/lstm_cell_cudnn.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "utils.h" 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | int main(int argc, char* argv[]) { 15 | assert(argc == 2); 16 | const char* filename = argv[1]; 17 | 18 | std::ofstream fout; 19 | fout.setf(std::ios::fixed); 20 | fout.precision(4); 21 | 22 | fout.open(filename, std::ios::out); 23 | 24 | srand(1234); 25 | constexpr std::array hidden_sizes = {128, 256, 512, 1024}; 26 | constexpr std::array batch_sizes = {32, 64, 128, 256}; 27 | const int seq_length = 1; 28 | const int depth = 1; 29 | 30 | fout << "[depth, seq_length, batch_size, hidden_size]\tAvgTime(ms)" 31 | << std::endl; 32 | 33 | for (auto hidden_size : hidden_sizes) { 34 | for (auto batch_size : batch_sizes) { 35 | genSeqs(batch_size, seq_length, false); 36 | float cudnn_time = TestCuDNNLSTM(batch_size, hidden_size, seq_length, 37 | depth, hidden_size); 38 | 39 | fout << "[" << depth << ", " << seq_length << ", " << batch_size << ", " 40 | << hidden_size << "]\t" << cudnn_time << std::endl; 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /kaleido/core/layout.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace kaleido { 6 | namespace core { 7 | 8 | using namespace cute; 9 | 10 | // In the row major layout, the contiguous dimension in memory is the 11 | // last dimension. 12 | template 13 | using RowMajor = 14 | cute::Layout, Int>, Stride, _1>>; 15 | 16 | __device__ auto make_row_major_layout(const int row, const int col, 17 | const int stride) { 18 | return cute::make_layout(make_shape(row, col), make_stride(stride, 1)); 19 | } 20 | 21 | // In the column major layout, the contiguous dimension in memory is the 22 | // first dimension. 
23 | template 24 | using ColMajor = 25 | cute::Layout, Int>, Stride<_1, Int>>; 26 | 27 | __device__ auto make_col_major_layout(const int row, const int col, 28 | const int stride) { 29 | return cute::make_layout(make_shape(row, col), make_stride(1, stride)); 30 | } 31 | 32 | template 33 | static constexpr size_t num_rows = cute::size<0>(Layout{}); 34 | 35 | template /* */ 36 | static constexpr size_t num_cols = cute::size<1>(Layout{}); 37 | 38 | } // namespace core 39 | } // namespace kaleido 40 | -------------------------------------------------------------------------------- /kaleido/core/operators/softmax_op.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/device/gpu_context.h" 5 | #include "kaleido/core/device/kernels/softmax.h" 6 | #include "kaleido/core/operators/softmax_op.h" 7 | #include "kaleido/core/types.h" 8 | 9 | namespace kaleido { 10 | namespace core { 11 | namespace ops { 12 | 13 | template 14 | class SoftmaxOp { 15 | public: 16 | void operator()(const GPUContext& context, const Tensor& x, Tensor& y, 17 | int dim = 0) { 18 | assert(x.ndim() == 2 && y.ndim() == 2 && x.shape() == y.shape()); 19 | if (dim == 1) LOG(FATAL) << "Not implemented yet."; 20 | 21 | const int kThreadsPerBlock = context.GetMaxThreadsPerBlock(); 22 | int width = x.dim_size(1); 23 | int height = x.dim_size(0); 24 | 25 | int block_num = 26 | width > kThreadsPerBlock 27 | ? 
kThreadsPerBlock 28 | : pow(2, static_cast(log2(static_cast(width)))); 29 | 30 | dim3 block(block_num, 1); 31 | dim3 grid(height, 1); 32 | 33 | cuda_kernel::KeMatrixSoftMax<<>>( 34 | x.data(), y.mutable_data(), width); 35 | } 36 | }; 37 | 38 | template class SoftmaxOp; 39 | 40 | } // namespace ops 41 | } // namespace core 42 | } // namespace kaleido 43 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/test_zip.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | from context import * 9 | 10 | 11 | class TestZip(unittest.TestCase): 12 | def setUp(self): 13 | random.seed(12345) 14 | 15 | data = list(range(7)) 16 | self.xs1 = kaleido.FractalTensor.from_pylist(data) 17 | 18 | data = list(range(7, 14, 1)) 19 | self.xs2 = kaleido.FractalTensor.from_pylist(data) 20 | 21 | def test_zipped_ta(self): 22 | zipped = kaleido.operations.zip(self.xs1, self.xs2) 23 | self.assertTrue(isinstance(zipped, kaleido.Iterative)) 24 | 25 | for x1, x2 in zipped: 26 | self.assertEqual(x2.data - x1.data, 7) 27 | 28 | def test_nested_zip(self): 29 | zipped = kaleido.operations.zip( 30 | kaleido.operations.zip(self.xs1, self.xs2), self.xs1) 31 | self.assertTrue(isinstance(zipped, kaleido.Iterative)) 32 | 33 | for xz, z in zipped: 34 | x, y = xz 35 | self.assertEqual(y.data - x.data, 7) 36 | self.assertEqual(y.data - z.data, 7) 37 | 38 | 39 | if __name__ == '__main__': 40 | unittest.main() 41 | -------------------------------------------------------------------------------- /benchmarks/rnn/cuDNN/stacked_lstm_cudnn.cu: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "utils.h" 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | int main(int argc, char* argv[]) { 15 | assert(argc == 2); 16 | const char* filename = argv[1]; 17 | 18 | std::ofstream fout; 19 | fout.setf(std::ios::fixed); 20 | fout.precision(4); 21 | 22 | fout.open(filename, std::ios::out); 23 | 24 | srand(1234); 25 | constexpr std::array hidden_sizes = {64, 128, 256, 512, 1024}; 26 | constexpr std::array batch_sizes = {32, 64}; 27 | constexpr size_t seq_length = 16; 28 | constexpr std::array depths = {1, 2, 4, 8, 16, 32}; 29 | 30 | fout << "[depth, seq_length, batch_size, hidden_size]\tAvgTime(ms)" 31 | << std::endl; 32 | 33 | for (auto depth : depths) { 34 | for (auto hidden_size : hidden_sizes) { 35 | for (auto batch_size : batch_sizes) { 36 | genSeqs(batch_size, seq_length, false); 37 | float cudnn_time = TestCuDNNLSTM(batch_size, hidden_size, seq_length, 38 | depth, hidden_size); 39 | 40 | fout << "[" << depth << ", " << seq_length << ", " << batch_size << ", " 41 | << hidden_size << "]\t" << cudnn_time << std::endl; 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /kaleido/core/operators/online_softmax_op.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "kaleido/core/device/gpu_context.h" 5 | #include "kaleido/core/device/kernels/online_softmax.h" 6 | #include "kaleido/core/operators/online_softmax_op.h" 7 | #include "kaleido/core/types.h" 8 | 9 | namespace kaleido { 10 | namespace core { 11 | namespace ops { 12 | 13 | template 14 | class OnlineNormalizedSoftmaxOp { 15 | public: 16 | void operator()(const GPUContext& context, const Tensor& x, Tensor& y, 17 | int dim = 0) { 18 | assert(x.ndim() == 2 && y.ndim() == 2 && x.shape() == y.shape()); 19 | if (dim == 1) LOG(FATAL) << "Not implemented yet."; 20 | 21 | const int kThreadsPerBlock = context.GetMaxThreadsPerBlock(); 22 | int width = x.dim_size(1); 23 | int height = x.dim_size(0); 24 | 25 | int block_num = 26 | width > kThreadsPerBlock 27 | ? kThreadsPerBlock 28 | : pow(2, static_cast(log2(static_cast(width)))); 29 | 30 | dim3 block(block_num, 1); 31 | dim3 grid(height, 1); 32 | 33 | cuda_kernel::KeOnlineNormalizedSoftMax<<>>( 34 | x.data(), y.mutable_data(), width); 35 | } 36 | }; 37 | 38 | template class OnlineNormalizedSoftmaxOp; 39 | 40 | } // namespace ops 41 | } // namespace core 42 | } // namespace kaleido 43 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tensor/arithmetic/contraction.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | from typing import Tuple 9 | 10 | import torch 11 | 12 | import kaleido 13 | from kaleido.frontend.operations.base import Contraction 14 | 15 | __all__ = [ 16 | 'mm', 17 | 'outer', 18 | ] 19 | 20 | 21 | class MatMul(Contraction): 22 | """ (tensor contraction: reduce + map) matrix multiplication 23 | 24 | y = a $\otimes$ b 25 | """ 26 | 27 | def __call__(self, x: kaleido.Tensor, y: kaleido.Tensor) -> kaleido.Tensor: 28 | t = super(MatMul, self).__call__(x, y) 29 | 30 | t.data = x.data @ y.data 31 | t._type._shape = list(t.data.shape) 32 | t.recompute_strides() 33 | return t 34 | 35 | 36 | mm = MatMul() 37 | 38 | 39 | class Outer(Contraction): 40 | """ Outer product of two vector.""" 41 | 42 | def __call__(self, x: kaleido.Tensor, y: kaleido.Tensor) -> kaleido.Tensor: 43 | t = super(Outer, self).__call__(x, y) 44 | 45 | t.data = torch.outer(x.data, y.data) 46 | t._type._shape = list(t.data.shape) 47 | 48 | t.recompute_strides() 49 | return t 50 | 51 | 52 | outer = Outer() 53 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | minimum_pre_commit_version: 3.0.0 2 | 3 | repos: 4 | - repo: https://github.com/Lucas-C/pre-commit-hooks.git 5 | rev: v1.5.5 6 | hooks: 7 | - id: remove-crlf 8 | files: (?!.*third_party)^.*$ | (?!.*book)^.*$ 9 | - repo: https://github.com/pre-commit/mirrors-yapf.git 10 | rev: v0.32.0 11 | hooks: 12 | - id: yapf 13 | additional_dependencies: [toml] 14 | files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ 15 | - repo: https://github.com/pycqa/isort 16 | rev: 5.13.2 17 | hooks: 18 | - id: isort 19 | name: isort (python) 20 | - repo: https://github.com/pre-commit/pre-commit-hooks 21 | rev: v4.6.0 22 | hooks: 23 | - id: check-added-large-files 24 | - id: 
check-merge-conflict 25 | - id: check-symlinks 26 | - id: detect-private-key 27 | files: (?!.*third_party)^.*$ | (?!.*book)^.*$ 28 | - id: end-of-file-fixer 29 | - id: check-yaml 30 | - id: check-toml 31 | - id: check-ast 32 | - id: check-executables-have-shebangs 33 | - id: check-shebang-scripts-are-executable 34 | - id: detect-private-key 35 | - id: debug-statements 36 | - repo: local 37 | hooks: 38 | - id: clang-format-with-version-check 39 | name: clang-format 40 | description: Format files with ClangFormat. 41 | entry: bash ./scripts/clang_format.hook -i 42 | language: system 43 | files: \.(c|cc|cxx|cpp|cu|h|cuh|hpp|hxx)$ 44 | -------------------------------------------------------------------------------- /examples/flash_attention/README.md: -------------------------------------------------------------------------------- 1 | # Algorithm Idea 2 | 3 | ## The intuition 4 | 5 | Given a query $q_i \in \mathbb{R}^d$, and lists of keys and values $k_1,\cdots,k_n$ and $v_1, \cdots, v_n \in \mathbb{R}^d$ of length $n$ 6 | 7 | 8 | $$\begin{align} 9 | s_i &= \text{dot}(q, k_i) \\ 10 | s_{i}' &= \frac{e^{s_i}}{\sum_je^{s_j}} \\ 11 | \text{attention}(q,k,v) &= \sum_i{v_is_i'} \\ 12 | \end{align}$$ 13 | 14 | The summation in equation (2) could be moved to the very end of the attention operation (3) 15 | 16 | $$\begin{align*} 17 | s_i &= \text{dot}(q, k_i) \\ 18 | s_{i}' &= e^{s_i} \\ 19 | \text{attention}(q,k,v) &= \frac{\sum_i{v_is_i'}}{\sum_je^{s_j}} 20 | \end{align*}$$ 21 | 22 | The processing can be written as: 23 | 24 | $$\begin{align*} 25 | s_i &= \text{dot}(q,k_i) \\ 26 | v^* &\leftarrow v^* + v_ie^{s_i} \\ 27 | s^* &\leftarrow s^* + e^{s_i} \\ 28 | \text{attention}(q,k,v) &= \frac{v^*}{s^*} 29 | \end{align*}$$ 30 | 31 | ## Numerical Stability 32 | 33 | Initialize $v^* \in \mathbb{R}^d = 0$, $s^* \in \mathbb{R} = 0$, $m^* = -\infty$ 34 | 35 | $$\begin{align*} 36 | s_i &= \text{dot}(q,k_i) \\ 37 | m_i &= \text{max}(m^*,s_i)\\ 38 | v^* &\leftarrow v^*e^{m^*-m_i} + 
v_ie^{s_i-m_i} \\ 39 | s^* &\leftarrow s^*e^{m^*-m_i} + e^{s_i-m_i} \\ 40 | m^* &\leftarrow m_i \\ 41 | \text{attention}(q,k,v) &= \frac{v^*}{s^*} \\ 42 | \end{align*}$$ 43 | 44 | # Reference 45 | 46 | 1. Rabe, Markus N., and Charles Staats. "[Self-attention Does Not Need $ O (n^ 2) $ Memory](https://arxiv.org/pdf/2112.05682.pdf)." arXiv preprint arXiv:2112.05682 (2021). 47 | -------------------------------------------------------------------------------- /examples/grid_rnn/grid_rnn_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import random 7 | 8 | import torch 9 | 10 | import kaleido 11 | from examples.utils import gen_dataset 12 | from kaleido import Tensor 13 | 14 | device = 'cpu' 15 | # device = 'cuda' 16 | 17 | depth = 3 18 | batch_size = 16 19 | vocab_size = 5000 20 | hidden_dim = 128 21 | 22 | MIN_LEN = 3 23 | MAX_LEN = 7 24 | 25 | src_words = gen_dataset(batch_size, vocab_size, device=device) 26 | trg_words = gen_dataset(batch_size, vocab_size, device=device) 27 | 28 | src_emb = Tensor((vocab_size, hidden_dim), kaleido.float32, device=device) 29 | src_emb.initialize(torch.rand, *src_emb.shape, device=device) 30 | 31 | trg_emb = Tensor((vocab_size, hidden_dim), kaleido.float32, device=device) 32 | trg_emb.initialize(torch.rand, *trg_emb.shape, device=device) 33 | 34 | 35 | def create_cell(): 36 | i2h = Tensor((2 * hidden_dim, hidden_dim), kaleido.float32, device=device) 37 | i2h.initialize(torch.rand, *i2h.shape, device=device) 38 | 39 | h2h = Tensor((hidden_dim, hidden_dim), kaleido.float32, device=device) 40 | h2h.initialize(torch.rand, *h2h.shape, device=device) 41 | 42 | bias = Tensor((1, hidden_dim), kaleido.float32, device=device) 43 | 
bias.initialize(torch.rand, *bias.shape, device=device) 44 | return {'i2h': i2h, 'h2h': h2h, 'bias': bias} 45 | -------------------------------------------------------------------------------- /kaleido/core/operators/launch_config.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace kaleido { 4 | namespace core { 5 | namespace ops { 6 | 7 | inline bool IsPow2(unsigned int x) { return ((x & (x - 1)) == 0); } 8 | 9 | inline unsigned int NextPow2(unsigned int x) { 10 | --x; 11 | x |= x >> 1; 12 | x |= x >> 2; 13 | x |= x >> 4; 14 | x |= x >> 8; 15 | x |= x >> 16; 16 | return ++x; 17 | } 18 | 19 | inline unsigned int Log2Floor(unsigned int x) { 20 | if (x == 0) return -1U; 21 | int log = 0; 22 | unsigned int value = x; 23 | for (int i = 4; i >= 0; --i) { 24 | int shift = (1 << i); 25 | unsigned int n = value >> shift; 26 | if (n != 0) { 27 | value = n; 28 | log += shift; 29 | } 30 | } 31 | assert(value == 1); 32 | return log; 33 | } 34 | 35 | template 36 | inline T DivUp(const X x, const Y y) { 37 | return static_cast((x + y - 1) / y); 38 | } 39 | 40 | void GetGpuLaunchConfig1D(const GPUContext& ctx, int64_t numel, int* threads, 41 | int* blocks) { 42 | int num_threads = ctx.GetMaxThreadsPerBlock(); 43 | int sm_count = ctx.GetSMCount(); 44 | 45 | if (numel / (sm_count << 1) < num_threads) 46 | num_threads = NextPow2(numel / (sm_count << 1)); 47 | else if (numel / (sm_count << 2) < num_threads) 48 | num_threads = NextPow2(numel / (sm_count << 2)); 49 | 50 | *threads = std::max(64, num_threads); 51 | *blocks = DivUp(numel, *threads); 52 | } 53 | 54 | } // namespace ops 55 | } // namespace core 56 | } // namespace kaleido 57 | -------------------------------------------------------------------------------- /kaleido/core/operators/tests/b2b_gemm_test_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* 4 | A and D are laid out in row-major 
fashion 5 | B and C are laid out in column-major fashion 6 | 7 | A[m, k] @ B[k, n] 8 | D[m, p] = P[m, n] @ C[n, p] 9 | */ 10 | template 11 | void cublas_two_hgemms(cublasHandle_t& handle, const kaleido::core::Tensor& A, 12 | const kaleido::core::Tensor& B, 13 | const kaleido::core::Tensor& C, kaleido::core::Tensor& P, 14 | kaleido::core::Tensor& D) { 15 | int kM = A.dim_size(0); 16 | int kN = B.dim_size(1); 17 | int kK = A.dim_size(1); 18 | int kP = C.dim_size(1); 19 | 20 | // cuBLAS gemm as the groundtruth 21 | kaleido::core::cuda_kernel::CublasGemm hgemm; 22 | 23 | Element alf = static_cast(1.); 24 | Element bet = static_cast(0.); 25 | 26 | // P = A @ B 27 | // P^T = B^T @ A^T 28 | hgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N /* transb*/, kN, kM, kK, &alf, 29 | B.data(), B.dim_size(0), A.data(), A.dim_size(1), 30 | &bet, P.mutable_data(), P.dim_size(1)); 31 | 32 | // D = P @ C, D and P are laid out in row-major fashion, while C is in 33 | // column major fashion. Operands of cuBLAS is by default in column fashion. 
34 | // D^T = C^T @ P^T; [p, m] = [p, n] @ [n, m] 35 | hgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N /* transb*/, kP, kM, kN, &alf, 36 | C.data(), C.dim_size(0), P.data(), P.dim_size(1), 37 | &bet, D.mutable_data(), D.dim_size(1)); 38 | } 39 | -------------------------------------------------------------------------------- /kaleido/core/place.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "boost/variant.hpp" 4 | 5 | namespace kaleido { 6 | namespace core { 7 | 8 | struct CPUPlace { 9 | CPUPlace() {} 10 | 11 | inline bool operator==(const CPUPlace&) const { return true; } 12 | inline bool operator!=(const CPUPlace&) const { return false; } 13 | inline bool operator<(const CPUPlace&) const { return false; } 14 | }; 15 | 16 | struct CUDAPlace { 17 | CUDAPlace() : CUDAPlace(0) {} 18 | explicit CUDAPlace(int d) : device(d) {} 19 | 20 | inline int GetDeviceId() const { return device; } 21 | inline bool operator==(const CUDAPlace& o) const { 22 | return device == o.device; 23 | } 24 | inline bool operator!=(const CUDAPlace& o) const { return !(*this == o); } 25 | inline bool operator<(const CUDAPlace& o) const { return device < o.device; } 26 | 27 | int device; 28 | }; 29 | 30 | class Place : public boost::variant { 31 | private: 32 | using PlaceBase = boost::variant; 33 | 34 | public: 35 | Place() = default; 36 | Place(const CPUPlace& cpu_place) : PlaceBase(cpu_place) {} 37 | Place(const CUDAPlace& cuda_place) : PlaceBase(cuda_place) {} 38 | 39 | bool operator<(const Place& place) const { 40 | return PlaceBase::operator<(static_cast(place)); 41 | } 42 | 43 | bool operator==(const Place& place) const { 44 | return PlaceBase::operator==(static_cast(place)); 45 | } 46 | }; 47 | 48 | std::ostream& operator<<(std::ostream&, const Place&); 49 | 50 | } // namespace core 51 | } // namespace kaleido 52 | -------------------------------------------------------------------------------- 
/examples/sparse_attention/README.md: -------------------------------------------------------------------------------- 1 | # Hyper-parameters for the BERT model 2 | 3 | - batch size of 256 sequences 4 | - each sequence has 512 tokens 5 | 6 | ||BERT base|BERT large| 7 | |:--|:--|:--| 8 | |number of Transformer blocks ($L$)|12|24| 9 | |hidden size ($H$)|768|1024| 10 | |self-attention head ($A$)|12|16| 11 | |feed-forward/filter = 4$H$ |3072|4096| 12 | |total parameters|110MB|340MB| 13 | 14 | # BigBird attending pattern 15 | 16 |

17 | 18 |

19 | 20 | 21 | The implementation refers to: 22 | 23 | 1. https://github.com/google-research/bigbird 24 | 1. https://github.com/sanghuynh1501/bigbird_pytorch 25 | 26 | Pseudo-codes for the blocked windowed-attention 27 | 28 | ```c++ 29 | qss: List> // batch_size, block_num, [block_size, hidden] 30 | kss: List> // batch_size, block_num, [hidden, block_size] 31 | vss: List> // batch_size, block_num, [block_size, hidden] 32 | 33 | wss: List> // batch_size, block_num, [block_size, hidden] 34 | 35 | for 0 <= i < len(qss) // iterate over `batch_size` 36 | for 2 <= j < len(qss[i]) - 2 // iterate over `block_num` 37 | for -1 <= k <= 1 // iterate over window size 38 | // [block_size, hidden] @ [hidden, block_size] 39 | ss1[k + 1] = qss[i][j] @ kss[i][j + k] 40 | 41 | // ss1 has a shape of [block_size, block_size * 3] 42 | ss2 = softmax(ss1[:]) 43 | 44 | // [block_size , block_size * 3] @ [block_size * 3, hidden] 45 | wss[i][j] = ss2 @ vss[i][j - 1 : j + 1] 46 | ``` 47 | -------------------------------------------------------------------------------- /docs/fractaltensor_operations/memory_layout_of_fractaltensor.md: -------------------------------------------------------------------------------- 1 | # Memory layout of jagged FractalTensor 2 | 3 | `FractalTensor` is a nested collection whose elements could be a set of `FractalTensor` variables. Underneath the hood, a `FractalTensor` is just a convenient way of describing large blocks of computer memory, so that the elements contained could be efficiently iterated over and manipulated. 4 | 5 | Recap the constraint of `FractalTensor` : 6 | 7 | 1. All `FractalTensor` elements (could be integers, tensors, FractalTensors) are homogenous. 8 | 1. If two `FractalTensor` types have different depths, they are treated as inequivalent types. 9 | 10 | When `FractalTensor` is nested, it is easy to conclude that all tensors contained in a `FractalTensor` have the same depths. The indices of a `FractalTensor` is organized as a tree. 
At compile time, only the depth of the `FractalTensor` is known. The exact structure of the indices tree is not known. 11 | 12 | 

13 |
14 | Fig 1. The memory layout of a FractalTensor variable x. 

16 | 17 | `FractalTensor` supports random read access. Elements contained in a `FractalTensor` can be indexed using the `[]` operator. 18 | 19 | Example: `a = x[1]` : 20 | 21 |

22 |
23 | Fig 2. Indexing the FractalTensor variable x. 

25 | 26 | Example: `a = x[2][1][3]` : 27 | 28 |

29 |
30 | Fig 3. Indexing the FractalTensor variable x with a multi-level index. 

32 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | cmake_minimum_required(VERSION 3.18) # cutlass 3.2 requires cmake 3.18+ 7 | 8 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") 9 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 10 | "${CMAKE_SOURCE_DIR}/cmake/Modules/") 11 | 12 | set(PYPARSER_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) 13 | set(PYPARSER_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) 14 | set(PYPARSER_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") 15 | 16 | project(kaleido CXX C) 17 | message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " 18 | "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") 19 | message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " 20 | "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") 21 | 22 | find_package(Threads REQUIRED) 23 | find_package(CUDA REQUIRED) 24 | find_package(CuDNN REQUIRED) 25 | 26 | set(Boost_USE_STATIC_LIBS OFF) 27 | set(Boost_USE_MULTITHREADED ON) 28 | set(Boost_USE_STATIC_RUNTIME OFF) 29 | find_package(Boost 1.45.0 COMPONENTS filesystem regex) 30 | 31 | if(Boost_FOUND) 32 | include_directories(${Boost_INCLUDE_DIR}) 33 | add_definitions("-DHAS_BOOST") 34 | else() 35 | message(FATAL_ERROR "Cannot find Boost.") 36 | endif() 37 | 38 | include(generic) 39 | include(python) 40 | include(third_party) 41 | 42 | add_subdirectory(kaleido/core) 43 | -------------------------------------------------------------------------------- /cmake/external/cutlass.cmake: -------------------------------------------------------------------------------- 1 | # 
------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | include(ExternalProject) 7 | 8 | set(CUTLASS_PREFIX_DIR ${THIRD_PARTY_PATH}/cutlass) 9 | set(CUTLASS_SOURCE_DIR ${CUTLASS_PREFIX_DIR}/src/extern_cutlass) 10 | set(CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git) 11 | set(CUTLASS_TAG v3.2.2) 12 | 13 | cache_third_party( 14 | extern_cutlass 15 | REPOSITORY 16 | ${CUTLASS_REPOSITORY} 17 | TAG 18 | ${CUTLASS_TAG} 19 | DIR 20 | CUTLASS_SOURCE_DIR) 21 | 22 | set(CUTLASS_INCLUDE_DIR "${CUTLASS_SOURCE_DIR}/include") 23 | include_directories(${CUTLASS_INCLUDE_DIR}) 24 | include_directories("${CUTLASS_SOURCE_DIR}/tools/util/include") 25 | 26 | ExternalProject_Add( 27 | extern_cutlass 28 | ${EXTERNAL_PROJECT_LOG_ARGS} 29 | ${SHALLOW_CLONE} 30 | "${CUTLASS_DOWNLOAD_CMD}" 31 | PREFIX ${CUTLASS_PREFIX_DIR} 32 | SOURCE_DIR ${CUTLASS_SOURCE_DIR} 33 | UPDATE_COMMAND "" 34 | CONFIGURE_COMMAND "" 35 | BUILD_COMMAND "" 36 | INSTALL_COMMAND "" 37 | TEST_COMMAND "") 38 | 39 | add_library(cutlass INTERFACE) 40 | target_include_directories(cutlass INTERFACE ${ROOT_DIR}/include 41 | ${ROOT_DIR}/tools/util/include) 42 | target_link_libraries(cutlass INTERFACE CUDA::cudart) 43 | target_include_directories(cutlass 44 | INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 45 | add_dependencies(cutlass extern_cutlass) 46 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/test_constants.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | from context import * 9 | 10 | 11 | class TestConstantsOp(unittest.TestCase): 12 | def setUp(self): 13 | random.seed(12345) 14 | 15 | def test_arange(self): 16 | x1 = kaleido.operations.slices(kaleido.operations.arange(11), dim=0) 17 | self.assertTrue(isinstance(x1, kaleido.FractalTensor)) 18 | self.assertTrue( 19 | isinstance(x1.element_type.element_type, 20 | kaleido.frontend.types.Int)) 21 | y = list(range(11)) 22 | for i, x in enumerate(x1): 23 | self.assertEqual(y[i], x.data.item()) 24 | 25 | x2 = kaleido.operations.slices(kaleido.operations.arange(5, 7), dim=0) 26 | y = list(range(5, 7)) 27 | for i, x in enumerate(x2): 28 | self.assertEqual(y[i], x.data.item()) 29 | 30 | x3 = kaleido.operations.slices( 31 | kaleido.operations.arange(5, 24, 3), dim=0) 32 | y = list(range(5, 24, 3)) 33 | for i, x in enumerate(x3): 34 | self.assertEqual(y[i], x.data.item()) 35 | 36 | def test_constants(self): 37 | for d in ['cpu', 'cuda']: 38 | x = kaleido.operations.zeros(shape=(3, 7), dtype='float', device=d) 39 | y = kaleido.operations.ones(shape=(3, 7), dtype='float', device=d) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /kaleido/core/device/kernels/lstm/stacked_lstm/region1.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #pragma once 5 | #include "kaleido/core/device/cuda_timer.h" 6 | #include "kaleido/core/device/kernels/lstm.h" 7 | 8 | namespace kaleido::core::cuda_kernel { 9 | template 11 | float StackedLstmRegion1(Element* hsss, Element* csss, const Element* xss, 12 | const Element* ws, const Element* us, const int depth, 13 | const int seq_length, const int batch_size, 14 | const int hidden_size) { 15 | CudaTimer timer; 16 | const Element* x = xss; 17 | Element* init; 18 | cudaMalloc((void**)&init, sizeof(Element) * hidden_size * batch_size); 19 | // Fill zero 20 | cudaMemset(reinterpret_cast(init), 0, 21 | sizeof(Element) * hidden_size * batch_size); 22 | 23 | const Element* c_init = init; 24 | const Element* h_init = init; 25 | const Element* w = ws; 26 | const Element* u = us; 27 | Element* css = csss; 28 | Element* hss = hsss; 29 | 30 | // TODO: NotFused version. 31 | using CuteFusedLSTMLayer = 32 | cuda_kernel::CuteLSTMLayer; 34 | 35 | CuteFusedLSTMLayer cute_fused_lstm_layer; 36 | 37 | float time = 38 | cute_fused_lstm_layer(w, x, u, c_init, h_init, css, hss, seq_length); 39 | 40 | CudaCheck(cudaFree(init)); 41 | 42 | return time; 43 | } 44 | } // namespace kaleido::core::cuda_kernel 45 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_lstm/figures/perf_with_increased_depth_subplot1.tsv: -------------------------------------------------------------------------------- 1 | Depth TestName AvgTime Throughput Ratio 2 | 1 PT-JIT 1.96780 16261.78517 1.00000 3 | 4 PT-JIT 7.77687 4114.76474 3.95206 4 | 8 PT-JIT 15.53171 2060.30052 7.89292 5 | 12 PT-JIT 24.19508 1322.58278 12.29548 6 | 16 PT-JIT 32.10150 996.83803 16.31337 7 | 20 PT-JIT 41.20204 776.66064 20.93808 8 | 1 TF-WhileOpLSTM 3.90668 8191.10010 1.00000 9 | 4 TF-WhileOpLSTM 13.55371 2360.97774 3.46937 10 | 8 TF-WhileOpLSTM 27.79051 1151.47235 7.11359 11 | 12 TF-WhileOpLSTM 43.66595 732.83646 11.17726 12 | 16 TF-WhileOpLSTM 62.98433 508.06287 
16.12222 13 | 20 TF-WhileOpLSTM 80.80373 396.02133 20.68348 14 | 1 TF-GraphMode 2.55039 12547.11352 1.00000 15 | 4 TF-GraphMode 8.89519 3597.44963 3.48778 16 | 8 TF-GraphMode 24.10036 1327.78104 9.44969 17 | 12 TF-GraphMode 30.35351 1054.24393 11.90153 18 | 16 TF-GraphMode 43.90387 728.86518 17.21459 19 | 20 TF-GraphMode 59.90020 534.22195 23.48671 20 | 1 TF-AutoGraph 2.50235 12787.99954 1.00000 21 | 4 TF-AutoGraph 6.49833 4924.34298 2.59689 22 | 8 TF-AutoGraph 12.64750 2530.14397 5.05426 23 | 12 TF-AutoGraph 18.24804 1753.61285 7.29237 24 | 16 TF-AutoGraph 55.36173 578.01660 22.12393 25 | 20 TF-AutoGraph 53.05073 603.19619 21.20040 26 | 1 TVM-Ansor 1.1106 28813.254097 1.000000 27 | 4 TVM-Ansor 3.7581 8514.941061 3.383847 28 | 8 TVM-Ansor 7.4149 4315.634735 6.676481 29 | 12 TVM-Ansor 11.1161 2878.707460 10.009094 30 | 16 TVM-Ansor 14.8240 2158.661630 13.347740 31 | 20 TVM-Ansor 18.4955 1730.150577 16.653611 32 | 1 CuDNN 0.37390 85583.48580 1.00000 33 | 4 CuDNN 1.07919 29651.76547 2.88629 34 | 8 CuDNN 2.02688 15787.78251 5.42087 35 | 12 CuDNN 2.97586 10753.18958 7.95889 36 | 16 CuDNN 4.06898 7864.38144 10.88242 37 | 20 CuDNN 4.85620 6589.51845 12.98782 38 | -------------------------------------------------------------------------------- /kaleido/frontend/tests/test_type_equivalence.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | from context import * 9 | 10 | from kaleido.frontend.types import Bool, Int, Real 11 | 12 | device = 'cpu' 13 | 14 | 15 | class Test1(unittest.TestCase): 16 | 17 | def test_basic_type(self): 18 | self.assertFalse(Real(64).is_equal_type(Int(32))) 19 | self.assertFalse(Real(64).is_equal_type(Real(32))) 20 | self.assertFalse(Bool().is_equal_type(Real(16))) 21 | 22 | self.assertTrue(Real(64).is_equal_type(Real(64))) 23 | self.assertTrue(Int(16).is_equal_type(Int(16))) 24 | 25 | def test_tensor_type(self): 26 | x = TensorStorage((2, 64), kaleido.float32, device=device) 27 | 28 | self.assertTrue(x.is_equal_type(x)) 29 | self.assertFalse( 30 | x.is_equal_type(TensorStorage((1, 3), kaleido.float32, device))) 31 | 32 | y = FractalTensorStorage( 33 | TensorStorage((2, 64), kaleido.float32, device=device)) 34 | self.assertTrue(y.is_equal_type(y)) 35 | self.assertTrue( 36 | y.is_equal_type( 37 | FractalTensorStorage( 38 | TensorStorage((2, 64), kaleido.float32, device=device)))) 39 | 40 | self.assertFalse( 41 | y.is_equal_type( 42 | FractalTensorStorage( 43 | TensorStorage((4, 7), kaleido.float32, device=device)))) 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /kaleido/parser/tests/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import ast 7 | import inspect 8 | import textwrap 9 | 10 | import astpretty 11 | import asttokens 12 | import torch 13 | 14 | import kaleido 15 | from kaleido import FractalTensor, Tensor 16 | from kaleido.frontend.types import FractalTensorStorage, TensorStorage 17 | 18 | __all__ = [ 19 | 'get_ast', 20 | 'print_ast', 21 | 'create_fractaltensor', 22 | 'create_depth2_fractaltensor', 23 | ] 24 | 25 | 26 | def get_ast(func): 27 | source = inspect.getsource(func) 28 | source = textwrap.dedent(source) 29 | col_offset = len(source.split("\n")[0]) - len(source.split("\n")[0]) 30 | 31 | _, file_lineno = inspect.getsourcelines(func) 32 | 33 | return ast.increment_lineno( 34 | asttokens.ASTTokens(source, parse=True).tree, file_lineno) 35 | 36 | 37 | def print_ast(func): 38 | astpretty.pprint(get_ast(func)) 39 | 40 | 41 | def create_fractaltensor(size, length): 42 | xs = FractalTensor(TensorStorage(size, kaleido.float32, device='cpu')) 43 | xs.indices = list(range(length)) 44 | xs.initialize(torch.rand, *xs.flatten_shape) 45 | return xs 46 | 47 | 48 | def create_depth2_fractaltensor(size, length1, length2): 49 | xss = FractalTensor( 50 | FractalTensorStorage(TensorStorage(size, kaleido.float32, 51 | device='cpu'))) 52 | xss.indices = [list(range(length1)) for _ in range(length2)] 53 | xss.initialize(torch.rand, *xss.flatten_shape) 54 | return xss 55 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tensor/data_movements.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | from typing import Tuple 9 | 10 | import torch 11 | 12 | import kaleido 13 | from kaleido import Tensor 14 | from kaleido.frontend.operations.base import Access 15 | 16 | __all__ = [ 17 | 'cat', 18 | 'permute', 19 | 'stack', 20 | ] 21 | 22 | 23 | class Cat(Access): 24 | 25 | def __call__(self, xs: Tuple, dim: int = 0): 26 | 27 | assert (len(xs)) 28 | 29 | v = torch.cat([x.data for x in xs], dim=dim) 30 | t = Tensor(v.shape, xs[0]._type._dtype, device=xs[0].device) 31 | t.data = v 32 | return t 33 | 34 | 35 | cat = Cat() 36 | 37 | 38 | class Permute(Access): 39 | 40 | def __call__(self, x: Tensor, axes: Tuple[int]) -> Tensor: 41 | assert len(axes) == x.ndim 42 | shape = [x.shape[i] for i in axes] 43 | t = kaleido.Tensor(shape, x._type._dtype, device=x.device) 44 | 45 | t.data = x.data.permute(*axes) 46 | t._type._shape = list(t.data.shape) 47 | t.recompute_strides() 48 | return t 49 | 50 | 51 | permute = Permute() 52 | 53 | 54 | class Stack(Access): 55 | 56 | def __call__(self, xs: Tuple, dim: int = 0): 57 | 58 | assert (len(xs)) 59 | 60 | v = torch.stack([x.data for x in xs], dim=dim) 61 | t = Tensor(v.shape, xs[0]._type._dtype, device=xs[0].device) 62 | t.data = v 63 | return t 64 | 65 | 66 | stack = Stack() 67 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/images/preprocess.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import sys 7 | 8 | test_num = int(sys.argv[1]) 9 | print(f'test number : {test_num}') 10 | 11 | data1 = open(f'stacked_lstm_unfused_elem_fused_bmm_{test_num}.csv', 12 | 'r').read().rstrip().split('\n') 13 | data2 = open(f'stacked_lstm_fused_elem_fused_bmm_{test_num}.csv', 14 | 'r').read().rstrip().split('\n') 15 | 16 | length = len(data1) 17 | header = data1[0] 18 | 19 | with open(f'stacked_lstm{test_num}.csv', 'w') as f: 20 | f.write('%s\n' % (header)) 21 | for i in range(1, length, 2): 22 | unfused_elem_fused_bmm = data1[i] 23 | unfused_elem_fused_bmm = unfused_elem_fused_bmm.replace( 24 | 'FractalTensor', 'FT_unfused-elem_fused-bmm') 25 | cudnn1 = float(data1[i + 1].split('|')[-2]) 26 | 27 | # depth = int(data1[i].split('|')[2].replace('[', '').replace( 28 | # ']', '').split(',')[-1]) 29 | # if i >= 3 and depth % 2: continue 30 | 31 | fused_elem_fused_bmm = data2[i] 32 | fused_elem_fused_bmm = fused_elem_fused_bmm.replace( 33 | 'FractalTensor', 'FT_fused-elem_fused-bmm') 34 | cudnn2 = float(data2[i + 1].split('|')[-2]) 35 | 36 | cudnn = (cudnn1 + cudnn2) / 2. 37 | cudnn_str = data1[i + 1].split('|') 38 | cudnn_str = '|'.join(cudnn_str[0:-2]) + '|%.3f' % (cudnn) + '|' 39 | 40 | f.write('%s\n' % (unfused_elem_fused_bmm)) 41 | f.write('%s\n' % (fused_elem_fused_bmm)) 42 | f.write('%s\n' % (cudnn_str)) 43 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tensor/arithmetic/broadcast.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | from typing import Tuple 9 | 10 | import kaleido 11 | from kaleido.frontend.operations.base import Broadcast 12 | 13 | __all__ = [ 14 | 'scale', 15 | '_broadcast_div', 16 | '_broadcast_pow', 17 | ] 18 | 19 | 20 | class Scale(Broadcast): 21 | """y = x * y where x is a scalar 22 | 23 | x is the smaller tensor while y is the larger tensor. 24 | """ 25 | 26 | def __call__(self, x: kaleido.Tensor, y: kaleido.Tensor) -> kaleido.Tensor: 27 | t = super(Scale, self).__call__(x, y) 28 | 29 | t.data = x.data * y.data 30 | t._type._shape = t.data.shape 31 | t.recompute_strides() 32 | return t 33 | 34 | 35 | scale = Scale() 36 | 37 | 38 | class _BroadcastDiv(Broadcast): 39 | 40 | def __call__(self, x: kaleido.Tensor, y: kaleido.Tensor) -> kaleido.Tensor: 41 | t = super(_BroadcastDiv, self).__call__(x, y) 42 | 43 | t.data = x.data / y.data 44 | t._type._shape = t.data.shape 45 | t.recompute_strides() 46 | return t 47 | 48 | 49 | _broadcast_div = _BroadcastDiv() 50 | 51 | 52 | class _BroadcastPow(Broadcast): 53 | 54 | def __call__(self, x: kaleido.Tensor, y: kaleido.Tensor) -> kaleido.Tensor: 55 | t = super(_BroadcastPow, self).__call__(x, y) 56 | 57 | t.data = x.data**y.data 58 | t._type._shape = t.data.shape 59 | t.recompute_strides() 60 | return t 61 | 62 | 63 | _broadcast_pow = _BroadcastPow() 64 | -------------------------------------------------------------------------------- /examples/rnn_attention/rnn_attention_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | import torch 7 | 8 | import kaleido 9 | from examples.utils import gen_dataset 10 | from kaleido import Tensor 11 | 12 | # ============= hyper parameters 13 | device = 'cpu' 14 | # device = 'cuda' 15 | 16 | batch_size = 7 17 | vocab_size = 5000 18 | hidden_dim = 512 19 | 20 | src_emb = Tensor((vocab_size, hidden_dim), kaleido.float32, device=device) 21 | src_emb.initialize(torch.rand, *src_emb.shape, device=device) 22 | 23 | trg_emb = Tensor((vocab_size, hidden_dim), kaleido.float32, device=device) 24 | trg_emb.initialize(torch.rand, *trg_emb.shape, device=device) 25 | 26 | 27 | def create_cell_param(): 28 | W = Tensor((hidden_dim, hidden_dim), kaleido.float32, device=device) 29 | W.initialize(torch.rand, *W.shape, device=device) 30 | 31 | U = Tensor((hidden_dim, hidden_dim), kaleido.float32, device=device) 32 | U.initialize(torch.rand, *U.shape, device=device) 33 | 34 | b = Tensor((1, hidden_dim), kaleido.float32, device=device) 35 | b.initialize(torch.rand, *b.shape, device=device) 36 | return {'W': W, 'U': U, 'b': b} 37 | 38 | 39 | src_params = create_cell_param() 40 | trg_params = create_cell_param() 41 | 42 | encoder_proj = Tensor((hidden_dim, 1), kaleido.float32, device=device) 43 | encoder_proj.initialize(torch.rand, *encoder_proj.shape, device=device) 44 | 45 | decoder_proj = Tensor((hidden_dim, 1), kaleido.float32, device=device) 46 | decoder_proj.initialize(torch.rand, *decoder_proj.shape, device=device) 47 | 48 | attn_params = (encoder_proj, decoder_proj) 49 | 50 | src_words = gen_dataset(batch_size, vocab_size) 51 | trg_words = gen_dataset(batch_size, vocab_size) 52 | -------------------------------------------------------------------------------- /examples/dilated_rnn/dilated_rnn.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) 
Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from typing import Tuple 7 | 8 | import context 9 | 10 | from examples.stacked_rnn.rnn_utils import * 11 | from examples.stacked_rnn.stacked_rnn import lstm_cell 12 | from examples.utils import gen_dataset 13 | 14 | 15 | # @kaleido.function(ctx) 16 | def dilated_layer(state: FractalTensor[Tensor['1, 512', float, 'cpu']], 17 | itr: int, 18 | Ws: FractalTensor[Tensor['512, 521', float, 'cpu']], 19 | Us: FractalTensor[Tensor['512, 512', float, 'cpu']], 20 | bs: FractalTensor[Tensor['1, 512', float, 'cpu']] 21 | ) -> FractalTensor[Tensor['1, 512', float, 'cpu']]: 22 | zeros = ops.zeros(shape=(1, 512), device='cpu', dtype='float') 23 | h, _ = ops.dilated_map( 24 | lambda xs: ops.scan(lambda s, x: lstm_cell(*s, x, Ws, Us, bs), 25 | xs, initializer=(zeros, zeros)), 26 | state, 27 | dilation=2**itr) 28 | return h 29 | 30 | 31 | # @kaleido.function(ctx) 32 | def model(batch_words: FractalTensor[FractalTensor[Tensor['1,', int, 'cpu']]], 33 | params: ModelParams 34 | ) -> FractalTensor[FractalTensor[Tensor['1, 512', float, 'cpu']]]: 35 | embs = ops.map(lambda words: ops.map(lambda word: 36 | ops.index(ops.slices(params.embedding, dim=0), word), words), 37 | batch_words) 38 | itrs = ops.enumerate(params.Wss, params.Uss, params.bss) 39 | rnn_outs = ops.map(lambda xs: ops.fold(lambda s, x: dilated_layer(s, *x), 40 | itrs, initializer=xs), embs) 41 | return rnn_outs 42 | 43 | 44 | if __name__ == '__main__': 45 | xss = gen_dataset(batch_size, vocab_size) 46 | yss = model(xss, params) 47 | -------------------------------------------------------------------------------- /kaleido/core/tensor_shape.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 
| namespace kaleido { 14 | namespace core { 15 | 16 | // The tensor shape is static after declaration. 17 | class TensorShape { 18 | public: 19 | explicit TensorShape(std::initializer_list sizes) 20 | : dim_sizes_(sizes), 21 | dim_(sizes.size()), 22 | numel_(std::accumulate(sizes.begin(), sizes.end(), 1, 23 | std::multiplies())) {} 24 | explicit TensorShape(const std::vector sizes) 25 | : dim_sizes_(std::move(sizes)), 26 | dim_(sizes.size()), 27 | numel_(std::accumulate(sizes.begin(), sizes.end(), 1, 28 | std::multiplies())) {} 29 | ~TensorShape() = default; 30 | bool IsEuqalShape(const TensorShape& b) const; 31 | void operator=(TensorShape& b) { 32 | dim_sizes_ = std::move(b.dims()); 33 | dim_ = b.ndim(); 34 | numel_ = b.numel(); 35 | }; 36 | 37 | bool operator==(const TensorShape& b) const { return IsEuqalShape(b); }; 38 | bool operator!=(const TensorShape& b) const { return !IsEuqalShape(b); }; 39 | 40 | std::string DebugString() const; 41 | 42 | size_t ndim() const { return dim_; } 43 | int64_t dim_size(int i) const { 44 | return i >= 0 ? dim_sizes_[i] : dim_sizes_[dim_ + i]; 45 | } 46 | 47 | const std::vector& dims() const { return dim_sizes_; } 48 | int64_t numel() const { return numel_; } 49 | 50 | int64_t count(int i) const { 51 | return std::accumulate(dim_sizes_.begin() + i, dim_sizes_.end(), 1, 52 | std::multiplies()); 53 | } 54 | 55 | std::vector dim_sizes_; 56 | size_t dim_; 57 | int64_t numel_; 58 | }; 59 | 60 | } // namespace core 61 | } // namespace kaleido 62 | -------------------------------------------------------------------------------- /kaleido/core/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | set(TARGET fractaltensor_core) 7 | 8 | include_directories("${PROJECT_SOURCE_DIR}") 9 | include_directories(${CUDA_INCLUDE_DIRS}) 10 | include_directories(${CUDNN_INCLUDE_DIRS}) 11 | include_directories(${Boost_INCLUDE_DIRS}) 12 | 13 | # set(PATH_PREFIX ${PROJECT_SOURCE_DIR}/kaleido/core) file(GLOB_RECURSE 14 | # PROTOBUF_FILE "${PATH_PREFIX}/*.proto") get_filename_component(PROTO_PATH 15 | # ${PROTOBUF_FILE} ABSOLUTE) get_filename_component(PROTO_NAME ${PROTOBUF_FILE} 16 | # NAME_WE) 17 | 18 | # cpp_proto_generate("${TARGET}_proto" SRCS "${PROTOBUF_FILE}") 19 | # add_custom_command( TARGET "${TARGET}_proto" POST_BUILD COMMAND cp *.pb.* 20 | # ${PATH_PREFIX} COMMENT "Copy generated C++ proto into directory kaleido/core." 21 | # WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) 22 | # 23 | # py_proto_generate("${TARGET}_proto_py" SRCS "${PROTOBUF_FILE}") 24 | # add_custom_command( TARGET "${TARGET}_proto_py" POST_BUILD COMMAND cp *.py 25 | # "${PROJECT_SOURCE_DIR}/kaleido/frontend" COMMENT "Copy generated python proto 26 | # into directory kaleido/frontend." 
WORKING_DIRECTORY 27 | # ${CMAKE_CURRENT_BINARY_DIR}) 28 | 29 | add_subdirectory(operators) 30 | 31 | file( 32 | GLOB KALEIDO_CORE_SRCS 33 | RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" 34 | "*.cc" "device/*.cc") 35 | 36 | cc_library( 37 | ${TARGET} 38 | SHARED 39 | IMPORTED 40 | SRCS 41 | ${KALEIDO_CORE_SRCS} 42 | DEPS 43 | python 44 | # ${TARGET}_proto protobuf 45 | ) 46 | 47 | target_link_libraries(${TARGET} Boost::filesystem Boost::regex) 48 | target_link_libraries(${TARGET} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} 49 | ${CUDNN_LIBRARIES}) 50 | 51 | add_subdirectory(tests) 52 | add_subdirectory(device/tests) 53 | -------------------------------------------------------------------------------- /kaleido/core/operators/fill_op.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/device/cuda_utils.h" 5 | #include "kaleido/core/device/gpu_context.h" 6 | #include "kaleido/core/device/kernels/fill.h" 7 | #include "kaleido/core/operators/fill_op.h" 8 | 9 | #include 10 | 11 | namespace kaleido { 12 | namespace core { 13 | namespace ops { 14 | 15 | template 16 | class FillOp { 17 | public: 18 | void operator()(Tensor& input, float value) { 19 | int numel = static_cast(input.numel()); 20 | T* data = input.mutable_data(); 21 | 22 | int threads = 128; 23 | int blocks = DIVUP(numel, threads); 24 | cuda_kernel::KeFillValue<<>>(data, numel, value); 25 | } 26 | 27 | void operator()(Tensor& input) { 28 | T* data = input.mutable_data(); 29 | int num = static_cast(input.numel()); 30 | cuda_kernel::FillRandomValue(data, num); 31 | } 32 | 33 | void operator()(Tensor& input, float mean = 0, float stddev = 0.1) { 34 | T* data = input.mutable_data(); 35 | int num = static_cast(input.numel()); 36 | cuda_kernel::FillRandomValue(data, num, mean, stddev); 37 | } 38 | 39 | void operator()(Tensor& input, const std::string& mode, 
float scale = 1.) { 40 | if (mode == "seq") { 41 | T* data = input.mutable_data(); 42 | int64_t numel = input.numel(); 43 | 44 | int threads = 128; 45 | int blocks = DIVUP(numel, threads); 46 | cuda_kernel::KeFillSequential<<>>(data, numel, scale); 47 | } else { 48 | LOG(FATAL) << "Unknown mode: " << mode << std::endl; 49 | } 50 | } 51 | }; 52 | 53 | template class FillOp; 54 | template class FillOp; 55 | template class FillOp; 56 | 57 | } // namespace ops 58 | } // namespace core 59 | } // namespace kaleido 60 | -------------------------------------------------------------------------------- /docs/fractaltensor_operations/memory_operations.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [Memory operations](#memory-operations) 4 | - [\*copy](#copy) 5 | - [repeat](#repeat) 6 | - [stack](#stack) 7 | - [flatten](#flatten) 8 | - [split (partition)](#split-partition) 9 | 10 | 14 | 15 | 16 | # Memory operations 17 | 18 | ## \*copy 19 | 20 | ```python 21 | copy(x: FractalTensor[T]) -> FractalTensor[T] 22 | ``` 23 | 24 | ## repeat 25 | 26 | $$\mathbf{repeat} ::\Psi n.[\alpha]^d_n \rightarrow \text{int}\rightarrow \Psi m.[\alpha]^{d}_m$$ 27 | 28 | ```python 29 | repeat(x: FractalTensor[T], repeats: int) -> FractalTensor[T] 30 | ``` 31 | 32 | Examples: 33 | 34 | ``` 35 | x: FractalTensor = [t1, t2, t3] 36 | 37 | y: FractalTensor = repeats(x, 3) 38 | 39 | y = [t1, t2, t3, t1, t2, t3] 40 | ``` 41 | 42 | ## stack 43 | 44 | $$\mathbf{stack} ::\Psi n.[\alpha]^1_n \rightarrow \text{int}\rightarrow \beta$$ 45 | 46 | ```python 47 | stack(x: FractalTensor[Tensor], axis: int) -> Tensor 48 | ``` 49 | 50 | `stack` is **ONLY** defined for a depth-1 `FractalTensor` . 51 | 52 | Example, suppose `x = FractalTensor[Tensor[3, 7], float32]` 53 | 54 |

55 | 56 |

57 | 58 | ## flatten 59 | 60 | $$\mathbf{flatten} ::\Psi n.[\alpha]^d_n \rightarrow \text{int}\rightarrow \beta$$ 61 | 62 | ```python 63 | flatten(x: FractalTensor[T], axis: int) -> Tensor 64 | ``` 65 | 66 | `flatten` is equal to retrieve all tensors contained in `x` by positive lexicographic order, put them into a depth-1 `FractalTensor` , and then call `stack` on this depth-1 `FractalTensor` . 67 | 68 | ## split (partition) 69 | 70 | $$\mathbf{split} ::\Psi n.[\alpha]^d_n \rightarrow \text{int}\rightarrow (\Psi m.[\beta]^d_m)$$ 71 | 72 | Partition a `FractalTensor` into a tuple of `FractalTensor` . 73 | 74 | ```python 75 | split(x: FractalTensor[T], n: int, pad_value: T = None) -> Tuple[FractalTensor[T]] 76 | ``` 77 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/test_flatten.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | import torch 9 | from context import * 10 | 11 | from kaleido import FractalTensor 12 | 13 | 14 | class TestFlatten(unittest.TestCase): 15 | 16 | def create_data(self): 17 | shape = [3, 4] 18 | dtype = kaleido.TensorStorage(shape, 19 | kaleido.float32, 20 | device='cpu', 21 | order='row') 22 | 23 | count = 0 24 | 25 | xs = [] 26 | x_indices = [] 27 | for i in range(5): 28 | xs.append(FractalTensor(dtype)) 29 | n = random.randint(13, 27) 30 | count += n 31 | x_indices.append(list(range(n))) 32 | 33 | x = FractalTensor.from_fractaltensors(*xs) 34 | x.indices = x_indices 35 | 36 | x.initialize(torch.rand, *x.flatten_shape) 37 | return x, count 38 | 39 | def test1(self): 40 | x, count = self.create_data() 41 | dim = 0 42 | y = kaleido.operations.flatten(x, dim) 43 | self.assertTrue(isinstance(y, kaleido.Tensor)) 44 | 45 | new_shape = x.element_type.shape 46 | new_shape[dim] = new_shape[dim] * count 47 | for s1, s2 in zip(new_shape, y.shape): 48 | self.assertEqual(s1, s2) 49 | 50 | def test2(self): 51 | x, count = self.create_data() 52 | dim = 1 53 | y = kaleido.operations.flatten(x, dim) 54 | self.assertTrue(isinstance(y, kaleido.Tensor)) 55 | 56 | new_shape = x.element_type.shape 57 | new_shape[dim] = new_shape[dim] * count 58 | for s1, s2 in zip(new_shape, y.shape): 59 | self.assertEqual(s1, s2) 60 | 61 | 62 | if __name__ == '__main__': 63 | unittest.main() 64 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tensor/reshape.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | __all__ = [ 9 | 'reshape', 10 | 'squeeze', 11 | 'unsqueeze', 12 | ] 13 | 14 | from typing import List, Union 15 | 16 | import torch 17 | 18 | import kaleido 19 | from kaleido.frontend.operations.base import BaseOp 20 | 21 | 22 | class Reshape(BaseOp): 23 | 24 | def __call__(self, x: kaleido.Tensor, shape: Union[List[int], int]): 25 | if not isinstance(shape, List): 26 | if isinstance(shape, int): 27 | shape = [shape] 28 | else: 29 | raise ValueError('shape should be list of integers.') 30 | 31 | super(Reshape, self).__call__(x) 32 | 33 | t = kaleido.Tensor(shape, x._type._dtype, device=x.device) 34 | t.data = torch.reshape(x.data, shape) 35 | t._type._shape = list(t.data.shape) 36 | t.recompute_strides() 37 | return t 38 | 39 | 40 | reshape = Reshape() 41 | 42 | 43 | class Squeeze(BaseOp): 44 | 45 | def __call__(self, x: kaleido.Tensor, dim: int = None): 46 | super(Squeeze, self).__call__(x) 47 | 48 | t = kaleido.Tensor([0], x._type._dtype, device=x.device) 49 | t.data = x.data.squeeze(dim) if dim else x.data.squeeze() 50 | t._type._shape = list(t.data.shape) 51 | t.recompute_strides() 52 | return t 53 | 54 | 55 | squeeze = Squeeze() 56 | 57 | 58 | class Unsqueeze(BaseOp): 59 | 60 | def __call__(self, x: kaleido.Tensor, dim: int): 61 | super(Unsqueeze, self).__call__(x) 62 | 63 | t = kaleido.Tensor([0], x._type._dtype, device=x.device) 64 | t.data = x.data.unsqueeze(dim) 65 | t._type._shape = list(t.data.shape) 66 | t.recompute_strides() 67 | return t 68 | 69 | 70 | unsqueeze = Unsqueeze() 71 | -------------------------------------------------------------------------------- /kaleido/core/tests/test_allocator.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "kaleido/core/cuda_allocator.h" 5 | #include "kaleido/core/device/cuda_utils.h" 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace kaleido { 14 | namespace core { 15 | 16 | TEST(test1, TEST_MEMORY_POOL) { 17 | cudaStream_t stream; 18 | CudaCheck(cudaStreamCreate(&stream)); 19 | 20 | std::shared_ptr memoryPool = 21 | std::make_shared(); 22 | // CudaMemoryPool is NOT multi-thread safe, 23 | // prevent multiple threads from modifying the memory pool at the same time. 24 | std::mutex mtx; 25 | 26 | // Before allocating memory from the memory pool, 27 | // you need to register **all** the streams that may use memory space 28 | // allocated from the memory pool. 29 | const std::lock_guard lock(mtx); 30 | memoryPool->add_track_stream(stream); 31 | 32 | // Get 256MB cuda memory block. 33 | // Only when the memory pool does not have a memory block that meets the 34 | // requirements, a new memory block is actually allocated from the physical 35 | // device. Requirements: 36 | // - The returned memory block size should be greater than the required 37 | // size. 38 | // - The returned memory block size should be less than twice the requested 39 | // size. 40 | 41 | // The returned memory space is guaranteed to meet the requested size. 42 | // If the user reads and writes beyond the requested size, undefined 43 | // behavior may occur. 44 | int nbytes = 256 * 1024 * 2014; 45 | void* ret = memoryPool->Allocate(nbytes); 46 | 47 | // Put the memory space back into the memory pool. 
48 | memoryPool->Deallocate(ret); 49 | 50 | nbytes = 128 * 1024 * 2014; 51 | ret = memoryPool->Allocate(nbytes); 52 | memoryPool->Deallocate(ret); 53 | 54 | nbytes = 128 * 1024 * 2014; 55 | ret = memoryPool->Allocate(nbytes); 56 | memoryPool->Deallocate(ret); 57 | } 58 | 59 | } // namespace core 60 | } // namespace kaleido 61 | -------------------------------------------------------------------------------- /benchmarks/rnn/cuDNN/utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "RNN_example.h" 5 | 6 | template 7 | float runRNNSample(RNNSampleOptions& options) { 8 | RNNSample sample; 9 | sample.setup(options); 10 | sample.run(); 11 | return sample.timeForward; 12 | } 13 | 14 | float TestCuDNNLSTM(int mini_batch, int hidden_size, int seq_length, 15 | int num_layers, int input_size) { 16 | RNNSampleOptions options; 17 | 18 | options.dataType = 1; // CUDNN_DATA_FLOAT 19 | // options.dataType = 0; 20 | options.seqLength = seq_length; 21 | options.numLayers = num_layers; 22 | options.inputSize = input_size; 23 | options.hiddenSize = hidden_size; 24 | options.projSize = hidden_size; 25 | options.miniBatch = mini_batch; 26 | options.inputMode = 1; // CUDNN_LINEAR_INPUT 27 | options.dirMode = 0; // CUDNN_UNIDIRECTIONAL 28 | options.cellMode = 2; // CUDNN_LSTM 29 | options.biasMode = 3; // CUDNN_RNN_DOUBLE_BIAS 30 | options.algorithm = 0; // CUDNN_RNN_ALGO_STANDARD 31 | options.mathPrecision = 1; // CUDNN_DATA_FLOAT 32 | // options.mathPrecision = 0; 33 | options.mathType = 0; // CUDNN_DEFAULT_MATH 34 | // options.mathType = 1; // CUDNN_TENSOR_OP_MATH 35 | options.dropout = 0.; 36 | options.printWeights = 0; 37 | 38 | return runRNNSample(options); 39 | // return runRNNSample<__half>(options); 40 | } 41 | 42 | int getRand(int min, int max) { return (rand() % (max - min)) + min + 1; } 43 | 44 | void genSeqs(int 
batch_size, int seq_length, bool random) { 45 | std::vector temp(batch_size, seq_length); 46 | 47 | std::default_random_engine e; 48 | e.seed(1234); 49 | std::normal_distribution distribution(seq_length / 2, seq_length / 8); 50 | 51 | for (int i = 1; i < batch_size; ++i) { 52 | if (random) { 53 | temp[i] = (int)distribution(e); 54 | } else { 55 | temp[i] = seq_length; 56 | } 57 | } 58 | sort(temp.begin(), temp.end()); 59 | reverse(temp.begin(), temp.end()); 60 | seqs = temp; 61 | } 62 | -------------------------------------------------------------------------------- /kaleido/core/operators/gather_nd_op.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/device/gpu_context.h" 5 | #include "kaleido/core/device/kernels/gather_scatter.h" 6 | #include "kaleido/core/operators/gather_nd_op.h" 7 | 8 | namespace kaleido { 9 | namespace core { 10 | namespace ops { 11 | 12 | template 13 | class GatherNdOp { 14 | public: 15 | void operator()(const GPUContext& context, Tensor& output, 16 | const Tensor& input, const Tensor& indices) { 17 | auto index_dims = indices.dims(); 18 | size_t index_dims_size = indices.ndim(); 19 | auto input_dims = input.dims(); 20 | size_t input_dims_size = input.ndim(); 21 | 22 | // indices for the first `end_size` dimensionalities are specified 23 | int64_t end_size = index_dims[index_dims_size - 1]; 24 | 25 | int64_t remain_numel = 1; 26 | for (int i = 0; i < index_dims_size - 1; ++i) remain_numel *= index_dims[i]; 27 | 28 | // slice size 29 | int64_t slice_size = 1; 30 | for (int64_t i = end_size; i < input_dims_size; ++i) { 31 | // innermost dimensionalities form contiguous memory to slice. 
32 | slice_size *= input_dims[i]; 33 | } 34 | 35 | int64_t* g_input_dims; 36 | CudaCheck(cudaMalloc(&g_input_dims, input_dims_size * sizeof(int64_t))); 37 | CudaCheck(cudaMemcpy(g_input_dims, input_dims.data(), 38 | input_dims_size * sizeof(int64_t), 39 | cudaMemcpyHostToDevice)); 40 | 41 | int64_t block = 512; 42 | int64_t n = slice_size * remain_numel; 43 | int64_t grid = (n + block - 1) / block; 44 | 45 | cuda_kernel::GatherNdCUDAKernel<<>>( 46 | input.data(), g_input_dims, indices.data(), 47 | output.mutable_data(), remain_numel, slice_size, end_size); 48 | 49 | cudaFree(g_input_dims); 50 | } 51 | }; 52 | 53 | template class GatherNdOp; 54 | template class GatherNdOp; 55 | 56 | } // namespace ops 57 | } // namespace core 58 | } // namespace kaleido 59 | -------------------------------------------------------------------------------- /benchmarks/rnn/cuDNN/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | project(benchmarks CXX C) 3 | 4 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 5 | "${CMAKE_SOURCE_DIR}/../../../cmake/Modules/") 6 | 7 | message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " 8 | "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") 9 | message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " 10 | "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") 11 | 12 | find_package(CUDA QUIET REQUIRED) 13 | find_package(CuDNN QUIET REQUIRED) 14 | 15 | set(CMAKE_BUILD_TYPE Release) 16 | 17 | set(CMAKE_CXX_STANDARD 14) 18 | set(CMAKE_CXX_STANDARD_REQUIRED TRUE) 19 | set(CMAKE_CUDA_STANDARD 14) 20 | set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") 23 | set(CMAKE_CXX_FLAGS_DEBUG 24 | "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") 25 | set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall -Wno-sign-compare") 26 | 27 | set(CMAKE_CXX_LINK_EXECUTABLE 28 | 
"${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") 29 | 30 | set(CUDA_PROPAGATE_HOST_FLAGS OFF) 31 | 32 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -w -gencode arch=compute_75,code=sm_75) 33 | set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} -w -gencode 34 | arch=compute_75,code=sm_75) 35 | set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 -gencode 36 | arch=compute_75,code=sm_75) 37 | 38 | include_directories(${CUDA_INCLUDE_DIRS}) 39 | include_directories(${CUDNN_INCLUDE_DIRS}) 40 | 41 | cuda_add_executable(cudnn_lstm main.cu) 42 | target_link_libraries(cudnn_lstm ${CUDA_LIBRARIES} ${CUDA_curand_LIBRARY} 43 | ${CUDNN_LIBRARIES}) 44 | 45 | cuda_add_executable(lstm_cell_cudnn lstm_cell_cudnn.cu) 46 | target_link_libraries(lstm_cell_cudnn ${CUDA_LIBRARIES} ${CUDA_curand_LIBRARY} 47 | ${CUDNN_LIBRARIES}) 48 | 49 | cuda_add_executable(stacked_lstm_cudnn stacked_lstm_cudnn.cu) 50 | target_link_libraries(stacked_lstm_cudnn ${CUDA_LIBRARIES} 51 | ${CUDA_curand_LIBRARY} ${CUDNN_LIBRARIES}) 52 | -------------------------------------------------------------------------------- /cmake/external/zlib.cmake: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | include(ExternalProject) 7 | 8 | set(ZLIB_PREFIX_DIR ${THIRD_PARTY_PATH}/zlib) 9 | set(ZLIB_SOURCE_DIR ${THIRD_PARTY_PATH}/zlib/src/extern_zlib) 10 | set(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) 11 | set(ZLIB_ROOT 12 | ${ZLIB_INSTALL_DIR} 13 | CACHE FILEPATH "zlib root directory." FORCE) 14 | set(ZLIB_INCLUDE_DIR 15 | "${ZLIB_INSTALL_DIR}/include" 16 | CACHE PATH "zlib include directory." 
FORCE) 17 | set(ZLIB_REPOSITORY https://github.com/madler/zlib.git) 18 | set(ZLIB_TAG v1.2.8) 19 | 20 | include_directories(${ZLIB_INCLUDE_DIR}) 21 | include_directories(${THIRD_PARTY_PATH}/install) 22 | 23 | cache_third_party( 24 | extern_zlib 25 | REPOSITORY 26 | ${ZLIB_REPOSITORY} 27 | TAG 28 | ${ZLIB_TAG} 29 | DIR 30 | ZLIB_SOURCE_DIR) 31 | 32 | ExternalProject_Add( 33 | extern_zlib 34 | ${EXTERNAL_PROJECT_LOG_ARGS} 35 | ${SHALLOW_CLONE} 36 | "${ZLIB_DOWNLOAD_CMD}" 37 | PREFIX ${ZLIB_PREFIX_DIR} 38 | SOURCE_DIR ${ZLIB_SOURCE_DIR} 39 | UPDATE_COMMAND "" 40 | CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} 41 | -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} 42 | -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} 43 | -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} 44 | -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} 45 | -DBUILD_SHARED_LIBS=OFF 46 | -DCMAKE_POSITION_INDEPENDENT_CODE=ON 47 | -DCMAKE_MACOSX_RPATH=ON 48 | -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} 49 | ${EXTERNAL_OPTIONAL_ARGS} 50 | CMAKE_CACHE_ARGS 51 | -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} 52 | -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON 53 | -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}) 54 | set(ZLIB_LIBRARIES 55 | "${ZLIB_INSTALL_DIR}/lib/libz.a" 56 | CACHE FILEPATH "zlib library." FORCE) 57 | 58 | add_library(zlib STATIC IMPORTED GLOBAL) 59 | set_property(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) 60 | add_dependencies(zlib extern_zlib) 61 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/test_aggregate.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | import itertools 9 | import operator 10 | 11 | import torch 12 | from context import * 13 | 14 | 15 | class TestScan(unittest.TestCase): 16 | MAX = 193 17 | N = 17 18 | 19 | def setUp(self): 20 | random.seed(12345) 21 | 22 | self.data = [ 23 | random.randint(0, TestScan.MAX) for _ in range(TestScan.N) 24 | ] 25 | self.xs = kaleido.FractalTensor.from_pylist(self.data) 26 | 27 | def test1(self): 28 | """Test single-level scan.""" 29 | 30 | expected_results = list(itertools.accumulate(self.data, operator.add)) 31 | 32 | ys = kaleido.operations.scan(lambda s, x: kaleido.operations.add(s, x), 33 | self.xs) 34 | self.assertTrue(isinstance(ys, kaleido.FractalTensor)) 35 | self.assertEqual(len(ys), len(self.xs)) 36 | 37 | init = kaleido.Tensor((1, ), kaleido.int32) 38 | init.data = torch.LongTensor([5]) 39 | ys = kaleido.operations.scan(lambda s, x: kaleido.operations.add(s, x), 40 | self.xs, init) 41 | self.assertTrue(isinstance(ys, kaleido.FractalTensor)) 42 | self.assertEqual(len(ys), len(self.xs)) 43 | 44 | expected_results = list( 45 | itertools.accumulate([5] + self.data, operator.add)) 46 | for i, y in enumerate(ys): 47 | self.assertEqual(y.data.item(), expected_results[i + 1]) 48 | 49 | def test2(self): 50 | init = kaleido.Tensor((1, ), kaleido.int32) 51 | init.data = torch.LongTensor([5]) 52 | 53 | ys, zs = kaleido.operations.scan(lambda s, x: (x, x), self.xs, init) 54 | 55 | for x, y, z in kaleido.operations.zip(self.xs, ys, zs): 56 | self.assertEqual(y.data.item(), x.data.item()) 57 | self.assertEqual(y.data.item(), x.data.item()) 58 | 59 | 60 | if __name__ == '__main__': 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /kaleido/frontend/operations/tests/test_join.py: -------------------------------------------------------------------------------- 1 | # 
------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import print_function 7 | 8 | import torch 9 | from context import * 10 | 11 | 12 | class TestJoin(unittest.TestCase): 13 | 14 | def create_depth1_fractaltensor(self, length, device='cpu'): 15 | shape = [3, 7] 16 | xs = kaleido.FractalTensor( 17 | kaleido.TensorStorage(shape, kaleido.float32, device=device)) 18 | xs.indices = list(range(length)) 19 | xs.initialize(torch.rand, *xs.flatten_shape, device=device) 20 | return xs 21 | 22 | def create_depth2_fractaltensor(self, length, device='cpu'): 23 | shape = [3, 7] 24 | xss = kaleido.FractalTensor( 25 | kaleido.FractalTensorStorage( 26 | kaleido.TensorStorage(shape, kaleido.float32, device=device))) 27 | xss.indices = [ 28 | list(range(random.randint(5, 17))) for _ in range(length) 29 | ] 30 | xss.initialize(torch.rand, *xss.flatten_shape, device=device) 31 | return xss 32 | 33 | def setUp(self): 34 | random.seed(12345) 35 | 36 | def test_join1(self): 37 | xs = self.create_depth1_fractaltensor(19) 38 | ys = self.create_depth1_fractaltensor(3) 39 | zs = kaleido.operations.join(xs, ys) 40 | 41 | self.assertTrue(isinstance(zs, kaleido.FractalTensor)) 42 | self.assertEqual(zs.depth, xs.depth) 43 | self.assertEqual(len(xs) + len(ys), len(zs)) 44 | self.assertEqual(xs.numel + ys.numel, zs.numel) 45 | 46 | def test_join2(self): 47 | xss = self.create_depth2_fractaltensor(11) 48 | yss = self.create_depth2_fractaltensor(7) 49 | zss = kaleido.operations.join(xss, yss) 50 | 51 | self.assertTrue(isinstance(zss, kaleido.FractalTensor)) 52 | self.assertEqual(zss.depth, xss.depth) 53 | self.assertEqual(len(xss) + len(yss), len(zss)) 54 | self.assertEqual(xss.numel + yss.numel, zss.numel) 55 | 56 | 57 | if __name__ == '__main__': 58 | unittest.main() 59 | 
-------------------------------------------------------------------------------- /kaleido/core/device/gpu_context.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kaleido/core/device/cuda_info.h" 4 | #include "kaleido/core/device/cuda_utils.h" 5 | #include "kaleido/core/device/device_context.h" 6 | #include "kaleido/core/place.h" 7 | 8 | namespace kaleido { 9 | namespace core { 10 | 11 | class GPUContext : public DeviceContext { 12 | public: 13 | GPUContext(); 14 | explicit GPUContext(const CUDAPlace& place) : place_{place} { 15 | CublasCheck(cublasCreate(&cublas_handle_)); 16 | CublasCheck(cublasSetPointerMode(cublas_handle_, CUBLAS_POINTER_MODE_HOST)); 17 | CudnnCheck(cudnnCreate(&cudnn_handle_)); 18 | 19 | compute_capability_ = GetGPUComputeCapability(place_.GetDeviceId()); 20 | multi_process_ = GetGPUMultiProcessors(place_.GetDeviceId()); 21 | max_threads_per_mp_ = 22 | GetGPUMaxThreadsPerMultiProcessor(place_.GetDeviceId()); 23 | max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.GetDeviceId()); 24 | max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.GetDeviceId()); 25 | device_name_ = GetDeviceName(); 26 | } 27 | 28 | static GPUContext& GetInstance() { 29 | static GPUContext context; 30 | return context; 31 | } 32 | 33 | ~GPUContext(); 34 | GPUContext(GPUContext const&) = delete; 35 | void operator=(GPUContext const&) = delete; 36 | 37 | int GetComputeCapability() const { return compute_capability_; }; 38 | 39 | int GetMaxPhysicalThreadCount() const { 40 | return multi_process_ * max_threads_per_mp_; 41 | }; 42 | 43 | int GetMaxThreadsPerBlock() const { return max_threads_per_block_; }; 44 | 45 | int GetSMCount() const { return multi_process_; }; 46 | 47 | dim3 GetCUDAMaxGridDimSize() const { return max_grid_dim_size_; }; 48 | std::string GetDeviceName() const { return device_name_; } 49 | 50 | cublasHandle_t cublas_handle() const { return cublas_handle_; } 51 | cudnnHandle_t 
cudnn_handle() const { return cudnn_handle_; } 52 | 53 | private: 54 | cublasHandle_t cublas_handle_; 55 | cudnnHandle_t cudnn_handle_; 56 | 57 | CUDAPlace place_; 58 | int compute_capability_; 59 | int multi_process_; 60 | int max_threads_per_mp_; 61 | int max_threads_per_block_; 62 | dim3 max_grid_dim_size_; 63 | std::string device_name_; 64 | }; 65 | 66 | } // namespace core 67 | } // namespace kaleido 68 | -------------------------------------------------------------------------------- /docs/fractaltensor_operations/extended_access_operations.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [Extended operations to access a single FractalTensor](#extended-operations-to-access-a-single-fractaltensor) 4 | - [head and tail](#head-and-tail) 5 | - [slide](#slide) 6 | - [[**deprecated**] access_by_depth](#deprecated-access_by_depth) 7 | 8 | 12 | 13 | 14 | # Extended operations to access a single FractalTensor 15 | 16 | In the full program, accessing `FractalTensor` is not materialized directly. They encode information of how parallel functions read the inputs. 17 | 18 | Extended access APIs are wrappers of accessing primitives. It is not necessary to enumerate and implement them all. They are implemented through access primitives and are all unified into and analyzed as some form of access functions in the IR program. 19 | 20 | ## head and tail 21 | 22 | $$\mathbf{head}::\Psi n.[\alpha]_n^d \rightarrow [\alpha]_1^{[d-1]}$$ 23 | $$\mathbf{tail}::\Psi n.[\alpha]_n^d \rightarrow [\alpha]_1^{[d-1]}$$ 24 | 25 | ```python 26 | head(x: FractalTensor[T]) -> T 27 | tail(x: FractalTensor[T]) -> T 28 | ``` 29 | 30 | ## slide 31 | 32 | ```python 33 | slide(input: FractalTensor[T], 34 | window_size: int, 35 | stride: int, 36 | dilation: int, 37 | padding: int = None, 38 | padding_value: T = None) -> FractalTensor[FractalTensor[T]]: 39 | ``` 40 | 41 |

42 |
43 | Fig. Apply a sliding window over a FractalTensor variable. 44 |

45 | 46 | 47 | ## [**deprecated**] access_by_depth 48 | 49 | _#TODO(ying): this operation is a little bit awkward. Rethink about this._ 50 | 51 | ```python 52 | access_by_depth(x: FractalTensor[T], depth: int) -> FractalTensor[T] 53 | ``` 54 | 55 | Example `x = access_by_depth(x, x.depth)` : 56 | 57 |

58 |
59 | Fig. Access a depth-N FractalTensor variable x by the depth N. 60 |

61 | 62 | Example `x = access_by_depth(x, x.depth - 1)` : 63 | 64 |

65 |
66 | Fig. Access a depth-N FractalTensor variable x by the depth N - 1. 67 |

68 | -------------------------------------------------------------------------------- /benchmarks/rnn/baselines/stacked_dilated_rnn/stacked_drnn_tensorflow.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import os 7 | 8 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 9 | 10 | import math 11 | import unittest 12 | from time import time 13 | 14 | import tensorflow as tf 15 | from tf_model import StackedDRNN 16 | from utils import * 17 | 18 | 19 | class TFGraphDRNN(unittest.TestCase): 20 | 21 | def setUp(self): 22 | self.shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) 23 | self.stddev = 1.0 / math.sqrt(HIDDEN_SIZE) 24 | 25 | self.log_dir = '' 26 | self.logger = init_logger(self.log_dir, 'tensorflow_drnn.txt') 27 | 28 | def _apply_forward(self, test_name, model, *inputs): 29 | for i in range(WARMUP): 30 | output = model(*inputs) 31 | 32 | start = time() 33 | 34 | for i in range(ITERS): 35 | output = model(*inputs) 36 | report(test_name, start, self.logger) 37 | 38 | def test_drnn_forward(self): 39 | shape = (SEQ_LEN, BATCH_SIZE, INPUT_SIZE) 40 | stddev = 1.0 / math.sqrt(HIDDEN_SIZE) 41 | 42 | gpus = tf.config.list_physical_devices('GPU') 43 | for device in [ 44 | # 'cpu', 45 | '/device:GPU:0', 46 | ]: 47 | with tf.device(device): 48 | model = StackedDRNN(batch_size=BATCH_SIZE, 49 | seq_len=SEQ_LEN, 50 | input_size=INPUT_SIZE, 51 | hidden_size=HIDDEN_SIZE, 52 | dilation=DILATION) 53 | 54 | x = tf.random.uniform(shape, minval=-stddev, maxval=stddev) 55 | rate = DILATION[-1] 56 | padding_data = tf.zeros( 57 | ((rate - (SEQ_LEN % rate)) % rate, BATCH_SIZE, INPUT_SIZE), 58 | dtype=tf.dtypes.float32) 59 | test_name = f'TensorFlow_Stacked_DLSTM_{device}' 60 | 
self._apply_forward(test_name, model, x, padding_data) 61 | 62 | 63 | if __name__ == '__main__': 64 | unittest.main(argv=['first-arg-is-ignored']) 65 | -------------------------------------------------------------------------------- /kaleido/core/device/kernels/gather_scatter.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #pragma once 5 | 6 | namespace kaleido { 7 | namespace core { 8 | namespace cuda_kernel { 9 | 10 | #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ 11 | int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ 12 | for (index_type i = __index__; __index__ < (num); \ 13 | __index__ += blockDim.x * gridDim.x, i = __index__) 14 | 15 | template 16 | __global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims, 17 | const int64_t* indices, T* output, 18 | size_t remain_size, size_t slice_size, 19 | size_t end_size) { 20 | CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) { 21 | int64_t indices_i = i / slice_size; 22 | int64_t slice_i = i - indices_i * slice_size; // offset inside the slice 23 | int64_t gather_i = 0; 24 | int64_t temp = slice_size; 25 | for (int64_t j = end_size - 1; j >= 0; --j) { 26 | auto index_value = indices[indices_i * end_size + j]; 27 | gather_i += (index_value * temp); 28 | temp *= input_dims[j]; 29 | } 30 | int64_t input_i = gather_i + slice_i; 31 | *(output + i) = *(input + input_i); 32 | } 33 | } 34 | 35 | template 36 | __global__ void ScatterNdCUDAKernel(const T* update, const int64_t* indices, 37 | T* output, const int64_t* output_dims, 38 | size_t remain_size, size_t slice_size, 39 | size_t end_size) { 40 | CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) { 41 | int64_t indices_i = i / slice_size; 42 | int64_t slice_i = i - indices_i * slice_size; // offset inside the slice 43 | int64_t gather_i = 0; 44 | int64_t temp = slice_size; 
45 | for (int64_t j = end_size - 1; j >= 0; --j) { 46 | int64_t index_value = indices[indices_i * end_size + j]; 47 | 48 | gather_i += (index_value * temp); 49 | temp *= output_dims[j]; 50 | } 51 | int64_t output_i = gather_i + slice_i; 52 | atomicAdd(output + output_i, *(update + i)); 53 | } 54 | } 55 | 56 | } // namespace cuda_kernel 57 | } // namespace core 58 | } // namespace kaleido 59 | -------------------------------------------------------------------------------- /kaleido/core/device/cuda_info.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "kaleido/core/device/cuda_info.h" 5 | 6 | #include "kaleido/core/device/cuda_utils.h" 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | namespace kaleido { 14 | namespace core { 15 | 16 | int GetGPUDeviceCount() { 17 | int deviceCount = 0; 18 | CudaCheck(cudaGetDeviceCount(&deviceCount)); 19 | return deviceCount; 20 | } 21 | 22 | int GetGPUComputeCapability(int id) { 23 | int major, minor; 24 | CudaCheck( 25 | cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id)); 26 | CudaCheck( 27 | cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id)); 28 | return major * 10 + minor; 29 | } 30 | 31 | int GetGPUMultiProcessors(int id) { 32 | int count; 33 | CudaCheck(cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); 34 | return count; 35 | } 36 | 37 | int GetGPUMaxThreadsPerMultiProcessor(int id) { 38 | int count; 39 | CudaCheck(cudaDeviceGetAttribute(&count, 40 | cudaDevAttrMaxThreadsPerMultiProcessor, id)); 41 | return count; 42 | } 43 | 44 | int GetGPUMaxThreadsPerBlock(int id) { 45 | int count; 46 | CudaCheck(cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); 47 | return count; 48 | } 49 | 50 | dim3 GetGpuMaxGridDimSize(int id) { 51 | dim3 grid_size; 52 | 53 | int size; 54 | 
CudaCheck(cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id)); 55 | grid_size.x = size; 56 | 57 | CudaCheck(cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id)); 58 | grid_size.y = size; 59 | 60 | CudaCheck(cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id)); 61 | grid_size.z = size; 62 | return grid_size; 63 | } 64 | 65 | std::string GetDeviceName() { 66 | cudaDeviceProp prop; 67 | cudaGetDeviceProperties(&prop, 0); 68 | 69 | std::stringstream ss(prop.name); 70 | const char delim = ' '; 71 | 72 | std::string s; 73 | std::vector out; 74 | 75 | while (std::getline(ss, s, delim)) { 76 | out.push_back(s); 77 | } 78 | 79 | std::stringstream out_ss; 80 | int i = 0; 81 | for (; i < out.size() - 1; ++i) out_ss << out[i] << "_"; 82 | out_ss << out[i]; 83 | return out_ss.str(); 84 | } 85 | 86 | } // namespace core 87 | } // namespace kaleido 88 | -------------------------------------------------------------------------------- /kaleido/parser/operations/access_patterns.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # -------------------------------------------------------------------------- 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | from collections import OrderedDict 9 | from typing import Tuple 10 | 11 | from kaleido.frontend.types import FractalTensorStorage, Storage, TensorStorage 12 | from kaleido.parser.ir_nodes import AccessNode 13 | from kaleido.parser.operations.common import registers 14 | 15 | 16 | @registers.access.register 17 | class Index(AccessNode): 18 | opcode = 'index' 19 | arity = 1 20 | 21 | def __init__(self, name: str): 22 | super(Index, self).__init__(name, OrderedDict(), OrderedDict()) 23 | 24 | def propagate_storage(self) -> Storage: 25 | super(Index, self).propagate_storage() 26 | ids = self.attributes['index'] 27 | self.output_ports[list(self.output_ports.keys())[-1]] = list( 28 | self.input_ports.values())[0].element_type() 29 | 30 | 31 | @registers.access.register 32 | class Last(AccessNode): 33 | opcode = 'last' 34 | arity = 1 35 | 36 | def __init__(self, name: str): 37 | super(Last, self).__init__(name, OrderedDict(), OrderedDict()) 38 | 39 | 40 | @registers.access.register 41 | class Slice(AccessNode): 42 | opcode = 'slice' 43 | arity = 1 44 | 45 | def __init__(self, name: str): 46 | super(Slice, self).__init__(name, OrderedDict(), OrderedDict()) 47 | 48 | def propagate_storage(self) -> Storage: 49 | super(Slice, self).propagate_storage() 50 | 51 | lower = self.attributes['lower'] 52 | step = self.attributes['step'] 53 | upper = self.attributes['upper'] 54 | 55 | s_in = list(self.input_ports.values())[0].element_type() 56 | s_out = FractalTensorStorage(s_in) 57 | s_out.indices = list(range((upper - lower) // step)) 58 | self.output_ports[list(self.output_ports.keys())[-1]] = s_out 59 | 60 | 61 | @registers.access.register 62 | class Slices(AccessNode): 63 | opcode = 'slices' 64 | arity = 1 65 | 66 | def __init__(self, name: str): 67 | super(Slices, self).__init__(name, OrderedDict(), OrderedDict()) 68 | 
69 | def propagate_storage(self) -> Storage: 70 | super().propagate_storage() 71 | -------------------------------------------------------------------------------- /benchmarks/rnn/cuDNN/main.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "utils.h" 5 | 6 | int main(int argc, char* argv[]) { 7 | srand(1234); 8 | int batch_size = 64; 9 | int hidden_size = 256; 10 | int seq_length = 100; 11 | int depth = 10; 12 | 13 | int input_size = hidden_size; 14 | 15 | genSeqs(batch_size, seq_length, false); 16 | 17 | for (auto depth : {2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22}) { 18 | float cudnn_time = 19 | TestCuDNNLSTM(batch_size, hidden_size, seq_length, depth, input_size); 20 | 21 | std::stringstream ss; 22 | ss << "[" << batch_size << ", " << hidden_size << ", " << seq_length << ", " 23 | << depth << "]|"; 24 | std::cout << "|CuDNN|" << ss.str() << "||||" << cudnn_time << "|" 25 | << std::endl; 26 | } 27 | 28 | std::cout << std::endl; 29 | 30 | for (auto seq_length : {50, 75, 100, 125, 150, 175, 200}) { 31 | genSeqs(batch_size, seq_length, false); 32 | float cudnn_time = 33 | TestCuDNNLSTM(batch_size, hidden_size, seq_length, depth, input_size); 34 | 35 | std::stringstream ss; 36 | ss << "[" << batch_size << ", " << hidden_size << ", " << seq_length << ", " 37 | << depth << "]|"; 38 | std::cout << "|CuDNN|" << ss.str() << "||||" << cudnn_time << "|" 39 | << std::endl; 40 | } 41 | 42 | std::cout << std::endl; 43 | 44 | genSeqs(batch_size, seq_length, true); 45 | 46 | for (auto depth : {2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22}) { 47 | float cudnn_time = 48 | TestCuDNNLSTM(batch_size, hidden_size, seq_length, depth, input_size); 49 | 50 | std::stringstream ss; 51 | ss << "[" << batch_size << ", " << hidden_size << ", " << seq_length << ", " 52 | << depth << "]|"; 53 | std::cout << "|CuDNN|" << ss.str() << "||||" << 
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "kaleido/core/device/gpu_context.h"
#include "kaleido/core/device/kernels/gather_scatter.h"
#include "kaleido/core/operators/scatter_nd_op.h"

namespace kaleido {
namespace core {
namespace ops {

// ScatterNdAddOp: accumulate `updates` into `data` at positions addressed
// by `indices` (scatter_nd_add semantics: indices[..., :] is a tuple
// addressing one slice of `data`; the matching slice of `updates` is added
// into it).
//
// NOTE(review): template parameter lists were lost in extraction and are
// reconstructed as <typename T>; confirm against scatter_nd_op.h.
template <typename T>
class ScatterNdAddOp {
 public:
  void operator()(const GPUContext& context, Tensor& data,
                  const Tensor& updates, const Tensor& indices) {
    auto index_dims = indices.dims();
    auto index_dims_size = index_dims.size();

    auto output_dims = data.dims();
    auto output_dims_size = output_dims.size();

    // The last dimension of `indices` is the length of one index tuple.
    int64_t end_size = index_dims[index_dims_size - 1];

    // Number of index tuples: product of all but the last indices dim.
    // (The former `remain_dims` vector was removed: it was computed with
    // incorrect iterator arithmetic — `index_dims.end() - index_dims_size`
    // instead of `end() - 1` — and was never used.)
    int64_t remain_numel = 1;
    for (int i = 0; i < index_dims_size - 1; ++i) remain_numel *= index_dims[i];

    // Elements per addressed slice: product of the output dims not covered
    // by an index tuple.
    int64_t slice_size = 1;
    for (int64_t i = end_size; i < output_dims_size; ++i)
      slice_size *= output_dims[i];

    // The kernel needs the output shape on the device to flatten indices.
    int64_t* g_output_dims;
    CudaCheck(cudaMalloc(&g_output_dims, output_dims_size * sizeof(int64_t)));
    CudaCheck(cudaMemcpy(g_output_dims, output_dims.data(),
                         output_dims_size * sizeof(int64_t),
                         cudaMemcpyHostToDevice));

    int64_t block = 512;
    int64_t n = slice_size * remain_numel;
    int64_t grid = (n + block - 1) / block;

    cuda_kernel::ScatterNdCUDAKernel<T><<<grid, block>>>(
        updates.data<T>(), indices.data<int64_t>(), data.mutable_data<T>(),
        g_output_dims, remain_numel, slice_size, end_size);

    // Release the shape buffer: the previous version leaked this
    // allocation on every call. cudaFree synchronizes with outstanding
    // device work, so freeing right after the asynchronous launch is safe.
    CudaCheck(cudaFree(g_output_dims));
  }
};

template class ScatterNdAddOp<float>;
template class ScatterNdAddOp<int>;

}  // namespace ops
}  // namespace core
}  // namespace kaleido
# --------------------------------------------------------------------------

from typing import List

import tensorflow as tf

__all__ = [
    'StackedDRNN',
]


class StackedDRNN(tf.keras.Model):
    """Stacked dilated LSTM baseline built from cuDNN LSTM layers.

    One CuDNNLSTM per entry of `dilation`. Between layers, the sequence is
    re-batched so that a layer with dilation rate r sees r-strided
    sub-sequences as independent batch items, then the output is reverted
    to the original layout for the next layer.
    """

    def __init__(self, batch_size: int, seq_len: int, input_size: int,
                 hidden_size: int, dilation: List[int]):
        super(StackedDRNN, self).__init__()

        self.batch_size = batch_size
        self.seq_len = seq_len
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.dilation = dilation
        self.num_layers = len(dilation)

        # Pad the sequence up to a multiple of the largest dilation rate so
        # every layer's re-batching splits evenly.
        rate = dilation[-1]
        self.padded_length = (rate - (seq_len % rate)) % rate + self.seq_len

        self.cells = []
        for i in range(self.num_layers):
            self.cells.append(
                tf.compat.v1.keras.layers.CuDNNLSTM(hidden_size,
                                                    return_sequences=False))

    # uncomment the following line to enable auto-graph.
    # @tf.function
    def call(self, input, padding_data):
        # step 0: pad the input along axis 0 up to padded_length.
        # NOTE(review): this assumes a time-major layout where axis 0 is the
        # (padded) sequence dimension — confirm against the caller.
        input_x = tf.concat((input, padding_data), axis=0)

        # no special treatment for the first layer.
        xs = self.cells[0](input_x)

        for i, cell in enumerate(self.cells[1:]):
            # for layers above the first layer.
            # step 1: pre-process: form a new batch in which each r-strided
            # sub-sequence (r = dilation[i + 1]) is an independent item.
            num_split = self.padded_length // self.dilation[i + 1]

            xs_ = [
                tf.reshape(x, (-1, self.hidden_size))
                for x in tf.split(xs, num_or_size_splits=num_split, axis=0)
            ]
            dilated_input = tf.stack(xs_)

            # step 2: call LSTM layer
            xs = cell(dilated_input)

            # step 3: post-processing, revert to the original layout
            xss = [
                tf.split(x, self.dilation[i + 1], axis=0)
                for x in tf.unstack(xs, axis=0)
            ]

            # Flatten the nested splits back into a single stacked tensor.
            xs = tf.stack([x for sublist in xss for x in sublist])
        return xs
20 | extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; 21 | auto* sbuf = reinterpret_cast(shared_buf); 22 | __shared__ MD md; 23 | 24 | // cache input into shared memory 25 | int tid = threadIdx.x; 26 | int next_idx = blockIdx.x * width + tid; // element index in input array 27 | int cur_idx = tid; // element index in current row 28 | for (; cur_idx < width; next_idx += BLOCK_SIZE, cur_idx += BLOCK_SIZE) { 29 | sbuf[cur_idx] = input[next_idx]; 30 | } 31 | __syncthreads(); 32 | 33 | // Loop1: reduction Max, the maximum value is stored in md.m. 34 | Max max; 35 | md.m = 36 | BlockRowReduce, BLOCK_SIZE>(sbuf, width, max, -MaxValue()); 37 | __syncthreads(); 38 | 39 | // Loop2: reduction sum of exponential and substraction: 40 | // sum(exp(x - m)). the reduction sum is stored in md.d; 41 | SubAndExp sub_and_exp(md.m); // mapper 42 | Add sum; // reducer 43 | Inverse inverse; // finalizer 44 | md.d = BlockRowReduce, Add, Inverse, BLOCK_SIZE>( 45 | sbuf, sbuf, width, sub_and_exp /*mapper*/, sum /*reducer*/, 46 | inverse /*finalizer*/, static_cast(0) /*initialier of reduction*/); 47 | __syncthreads(); 48 | 49 | // Loop3: map to rescale. 50 | for (int cur_idx = tid; cur_idx < width; cur_idx += BLOCK_SIZE) { 51 | sbuf[cur_idx] *= md.d; 52 | } 53 | 54 | // Store result into global memory. 55 | tid = threadIdx.x; 56 | next_idx = blockIdx.x * width + tid; 57 | cur_idx = tid; 58 | for (; cur_idx < width; next_idx += BLOCK_SIZE, cur_idx += BLOCK_SIZE) { 59 | output[next_idx] = sbuf[cur_idx]; 60 | } 61 | } 62 | 63 | } // namespace cuda_kernel 64 | } // namespace core 65 | } // namespace kaleido 66 | -------------------------------------------------------------------------------- /cmake/third_party.cmake: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 
# MIT License.
# --------------------------------------------------------------------------

set(THIRD_PARTY_PATH
    "${CMAKE_BINARY_DIR}/third_party"
    CACHE STRING
          "A path setting third party libraries download & build directories.")

set(THIRD_PARTY_CACHE_PATH
    "${CMAKE_SOURCE_DIR}"
    CACHE STRING
          "A path cache third party source code to avoid repeated download.")

set(THIRD_PARTY_BUILD_TYPE Release)
set(EXTERNAL_PROJECT_LOG_ARGS
    LOG_DOWNLOAD
    0
    LOG_UPDATE
    1
    LOG_CONFIGURE
    1
    LOG_BUILD
    0
    LOG_TEST
    1
    LOG_INSTALL
    0)
set(SHALLOW_CLONE "GIT_SHALLOW TRUE")

# Derive <TARGET_NAME>_DOWNLOAD_CMD (a git-repo or URL download command for
# ExternalProject_Add) from the TARGET name and the REPOSITORY/TAG or URL
# arguments, and export it to the parent scope.
function(cache_third_party TARGET)
  set(options "")
  set(oneValueArgs URL REPOSITORY TAG DIR)
  set(multiValueArgs "")
  # FIX: was "${optionps}" (typo) — an undefined variable passed where the
  # options list belongs. Harmless today only because `options` is empty.
  cmake_parse_arguments(cache_third_party "${options}" "${oneValueArgs}"
                        "${multiValueArgs}" ${ARGN})

  # extern_gflags -> GFLAGS etc.: strip the prefix and digits, uppercase.
  string(REPLACE "extern_" "" TARGET_NAME ${TARGET})
  string(REGEX REPLACE "[0-9]+" "" TARGET_NAME ${TARGET_NAME})
  string(TOUPPER ${TARGET_NAME} TARGET_NAME)

  if(cache_third_party_REPOSITORY)
    set(${TARGET_NAME}_DOWNLOAD_CMD GIT_REPOSITORY
                                    ${cache_third_party_REPOSITORY})

    if(cache_third_party_TAG)
      list(APPEND ${TARGET_NAME}_DOWNLOAD_CMD GIT_TAG ${cache_third_party_TAG})
    endif()
  elseif(cache_third_party_URL)
    set(${TARGET_NAME}_DOWNLOAD_CMD URL ${cache_third_party_URL})
  else()
    message(
      FATAL_ERROR "Download link (Git repo or URL) must be specified for cache!"
    )
  endif()

  # Pass ${TARGET_NAME}_DOWNLOAD_CMD to parent scope, the double quotation marks
  # can't be removed
  set(${TARGET_NAME}_DOWNLOAD_CMD
      "${${TARGET_NAME}_DOWNLOAD_CMD}"
      PARENT_SCOPE)
endfunction()

set(third_party_deps)

include(external/gflags)
include(external/glog)
include(external/gtest)
include(external/pybind)
include(external/zlib)
include(external/protobuf)
# required by benchmarks include(external/tvm)
include(external/cccl)
include(external/cutlass)
list(APPEND third_party_deps extern_gtest extern_glog)
FORCE) 14 | set(GFLAGS_REPOSITORY https://github.com/gflags/gflags.git) 15 | set(GFLAGS_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a) 16 | set(GFLAGS_LIBRARIES 17 | "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" 18 | CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) 19 | set(BUILD_COMMAND $(MAKE) --silent) 20 | set(INSTALL_COMMAND $(MAKE) install) 21 | 22 | include_directories(${GFLAGS_INCLUDE_DIR}) 23 | 24 | cache_third_party( 25 | extern_gflags 26 | REPOSITORY 27 | ${GFLAGS_REPOSITORY} 28 | TAG 29 | ${GFLAGS_TAG} 30 | DIR 31 | GFLAGS_SOURCE_DIR) 32 | 33 | ExternalProject_Add( 34 | extern_gflags 35 | ${EXTERNAL_PROJECT_LOG_ARGS} 36 | ${SHALLOW_CLONE} 37 | "${GFLAGS_DOWNLOAD_CMD}" 38 | PREFIX ${GFLAGS_PREFIX_DIR} 39 | SOURCE_DIR ${GFLAGS_SOURCE_DIR} 40 | BUILD_COMMAND ${BUILD_COMMAND} 41 | INSTALL_COMMAND ${INSTALL_COMMAND} 42 | UPDATE_COMMAND "" 43 | CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} 44 | -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} 45 | -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} 46 | -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} 47 | -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} 48 | -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} 49 | -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} 50 | -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} 51 | -DBUILD_STATIC_LIBS=ON 52 | -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} 53 | -DCMAKE_POSITION_INDEPENDENT_CODE=ON 54 | -DBUILD_TESTING=OFF 55 | -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} 56 | ${EXTERNAL_OPTIONAL_ARGS} 57 | CMAKE_CACHE_ARGS 58 | -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} 59 | -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON 60 | -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}) 61 | 62 | add_library(gflags STATIC IMPORTED GLOBAL) 63 | set_property(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) 64 | add_dependencies(gflags extern_gflags) 65 | -------------------------------------------------------------------------------- /cmake/external/glog.cmake: 
-------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the 3 | # MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | include(ExternalProject) 7 | 8 | set(GLOG_PREFIX_DIR ${THIRD_PARTY_PATH}/glog) 9 | set(GLOG_SOURCE_DIR ${THIRD_PARTY_PATH}/glog/src/extern_glog) 10 | set(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) 11 | set(GLOG_INCLUDE_DIR 12 | "${GLOG_INSTALL_DIR}/include" 13 | CACHE PATH "glog include directory." FORCE) 14 | set(GLOG_REPOSITORY https://github.com/google/glog.git) 15 | set(GLOG_TAG v0.3.5) 16 | 17 | set(GLOG_LIBRARIES 18 | "${GLOG_INSTALL_DIR}/lib/libglog.a" 19 | CACHE FILEPATH "glog library." FORCE) 20 | set(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) 21 | 22 | include_directories(${GLOG_INCLUDE_DIR}) 23 | 24 | cache_third_party( 25 | extern_glog 26 | REPOSITORY 27 | ${GLOG_REPOSITORY} 28 | TAG 29 | ${GLOG_TAG} 30 | DIR 31 | GLOG_SOURCE_DIR) 32 | 33 | ExternalProject_Add( 34 | extern_glog 35 | ${EXTERNAL_PROJECT_LOG_ARGS} 36 | ${SHALLOW_CLONE} 37 | "${GLOG_DOWNLOAD_CMD}" 38 | DEPENDS gflags 39 | PREFIX ${GLOG_PREFIX_DIR} 40 | SOURCE_DIR ${GLOG_SOURCE_DIR} 41 | UPDATE_COMMAND "" 42 | CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} 43 | -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} 44 | -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} 45 | -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} 46 | -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} 47 | -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} 48 | -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} 49 | -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} 50 | -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} 51 | -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib 52 | -DCMAKE_POSITION_INDEPENDENT_CODE=ON 53 | -DWITH_GFLAGS=ON 54 | -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags 55 | 
-DBUILD_TESTING=OFF 56 | -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} 57 | ${EXTERNAL_OPTIONAL_ARGS} 58 | CMAKE_CACHE_ARGS 59 | -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} 60 | -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib 61 | -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON 62 | -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}) 63 | 64 | add_library(glog SHARED IMPORTED GLOBAL) 65 | set_property(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) 66 | add_dependencies(glog extern_glog gflags) 67 | link_libraries(glog gflags) 68 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/stacked_lstm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | project(recurrence_test CXX C) 3 | 4 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 5 | "${CMAKE_SOURCE_DIR}/../../../../cmake/Modules/") 6 | 7 | set(CMAKE_BUILD_TYPE Release) 8 | 9 | set(CMAKE_CXX_STANDARD 17) 10 | set(CMAKE_CXX_STANDARD_REQUIRED TRUE) 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) 13 | 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") 15 | set(CMAKE_CXX_FLAGS_DEBUG 16 | "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") 17 | set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall -Wno-sign-compare") 18 | 19 | set(CMAKE_CXX_LINK_EXECUTABLE 20 | "${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") 21 | 22 | find_package(CUDA QUIET REQUIRED) 23 | find_package(CuDNN QUIET REQUIRED) 24 | 25 | cuda_select_nvcc_arch_flags(ARCH_FLAGS "Auto") 26 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${ARCH_FLAGS}") 27 | message(STATUS "CUDA Architecture flags = ${ARCH_FLAGS}") 28 | set(CUDA_PROPAGATE_HOST_FLAGS OFF) 29 | 30 | # FIXME(ying): The RNN examples do not rely on cutlass, but the fill kernel in 31 | # `fill.h` depends on cutlass, and cutlass require C++17. 
This is a hotfix to 32 | # bypass the compiling error. Make the dependency clean in the future. 33 | if(CUTLASS_NATIVE_CUDA) 34 | set(CMAKE_CUDA_STANDARD 17) 35 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 36 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 37 | else() 38 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++17) 39 | endif() 40 | include_directories( 41 | "../../../../build/third_party/cutlass/src/extern_cutlass/include") 42 | include_directories( 43 | "../../../../build/third_party/cutlass/src/extern_cutlass/tools/util/include") 44 | 45 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -w ${ARCH_FLAGS}) 46 | set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} -w ${ARCH_FLAGS}) 47 | set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 ${ARCH_FLAGS}) 48 | 49 | include_directories(${CUDA_INCLUDE_DIRS}) 50 | include_directories(${CUDNN_INCLUDE_DIRS}) 51 | 52 | include_directories("../") 53 | include_directories("../../../../") 54 | include_directories("../../../../build/third_party/install/glog/include") 55 | include_directories( 56 | "../../../../build/third_party/gflags/src/extern_gflags-build/include") 57 | link_directories("../../../../build/kaleido/core") 58 | link_directories("../../../../build/kaleido/core/operators") 59 | 60 | cuda_add_executable(lstm lstm.cu) 61 | target_link_libraries( 62 | lstm 63 | ${CUDA_LIBRARIES} 64 | ${CUDNN_LIBRARIES} 65 | ${CUDA_CUBLAS_LIBRARIES} 66 | ${CUDA_curand_LIBRARY} 67 | fractaltensor_core 68 | print_op) 69 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/grid_lstm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | project(recurrence_test CXX C) 3 | 4 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 5 | "${CMAKE_SOURCE_DIR}/../../../../cmake/Modules/") 6 | 7 | set(CMAKE_BUILD_TYPE Release) 8 | 9 | set(CMAKE_CXX_STANDARD 17) 10 | 
set(CMAKE_CXX_STANDARD_REQUIRED TRUE) 11 | set(CMAKE_CUDA_STANDARD 17) 12 | set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) 13 | 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") 15 | set(CMAKE_CXX_FLAGS_DEBUG 16 | "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") 17 | set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall -Wno-sign-compare") 18 | 19 | set(CMAKE_CXX_LINK_EXECUTABLE 20 | "${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") 21 | 22 | find_package(CUDA QUIET REQUIRED) 23 | find_package(CuDNN QUIET REQUIRED) 24 | 25 | cuda_select_nvcc_arch_flags(ARCH_FLAGS "Auto") 26 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${ARCH_FLAGS}") 27 | message(STATUS "CUDA Architecture flags = ${ARCH_FLAGS}") 28 | set(CUDA_PROPAGATE_HOST_FLAGS OFF) 29 | 30 | # FIXME(ying): The RNN examples do not rely on cutlass, but the fill kernel in 31 | # `fill.h` depends on cutlass, and cutlass require C++17. This is a hotfix to 32 | # bypass the compiling error. Make the dependency clean in the future. 
33 | if(CUTLASS_NATIVE_CUDA) 34 | set(CMAKE_CUDA_STANDARD 17) 35 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 36 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 37 | else() 38 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++17) 39 | endif() 40 | include_directories( 41 | "../../../../build/third_party/cutlass/src/extern_cutlass/include") 42 | include_directories( 43 | "../../../../build/third_party/cutlass/src/extern_cutlass/tools/util/include") 44 | 45 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -w ${ARCH_FLAGS}) 46 | set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} -w ${ARCH_FLAGS}) 47 | set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 ${ARCH_FLAGS}) 48 | 49 | include_directories(${CUDA_INCLUDE_DIRS}) 50 | include_directories(${CUDNN_INCLUDE_DIRS}) 51 | 52 | include_directories("../") 53 | include_directories("../../../../") 54 | include_directories("../../../../build/third_party/install/glog/include") 55 | include_directories( 56 | "../../../../build/third_party/gflags/src/extern_gflags-build/include") 57 | link_directories("../../../../build/kaleido/core") 58 | link_directories("../../../../build/kaleido/core/operators") 59 | 60 | cuda_add_executable(grid_rnn stacked_grid_rnn.cu) 61 | target_link_libraries( 62 | grid_rnn 63 | ${CUDA_LIBRARIES} 64 | ${CUDNN_LIBRARIES} 65 | ${CUDA_CUBLAS_LIBRARIES} 66 | ${CUDA_curand_LIBRARY} 67 | fractaltensor_core 68 | concat_op 69 | print_op) 70 | -------------------------------------------------------------------------------- /benchmarks/rnn/fractaltensor/dilated_lstm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | project(recurrence_test CXX C) 3 | 4 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 5 | "${CMAKE_SOURCE_DIR}/../../../../cmake/Modules/") 6 | 7 | set(CMAKE_BUILD_TYPE Release) 8 | 9 | set(CMAKE_CXX_STANDARD 17) 10 | set(CMAKE_CXX_STANDARD_REQUIRED TRUE) 11 | set(CMAKE_CUDA_STANDARD 17) 12 | 
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) 13 | 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--no-undefined") 15 | set(CMAKE_CXX_FLAGS_DEBUG 16 | "$ENV{CXXFLAGS} -O0 -fPIC -Wall -Wno-sign-compare -g2 -ggdb") 17 | set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -fPIC -O3 -Wall -Wno-sign-compare") 18 | 19 | set(CMAKE_CXX_LINK_EXECUTABLE 20 | "${CMAKE_CXX_LINK_EXECUTABLE} -lpthread -ldl -lrt") 21 | 22 | find_package(CUDA QUIET REQUIRED) 23 | find_package(CuDNN QUIET REQUIRED) 24 | 25 | cuda_select_nvcc_arch_flags(ARCH_FLAGS "Auto") 26 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${ARCH_FLAGS}") 27 | message(STATUS "CUDA Architecture flags = ${ARCH_FLAGS}") 28 | 29 | # FIXME(ying): The RNN examples do not rely on cutlass, but the fill kernel in 30 | # `fill.h` depends on cutlass, and cutlass require C++17. This is a hotfix to 31 | # bypass the compiling error. Make the dependency clean in the future. 32 | if(CUTLASS_NATIVE_CUDA) 33 | set(CMAKE_CUDA_STANDARD 17) 34 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 35 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr) 36 | else() 37 | list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++17) 38 | endif() 39 | include_directories( 40 | "../../../../build/third_party/cutlass/src/extern_cutlass/include") 41 | include_directories( 42 | "../../../../build/third_party/cutlass/src/extern_cutlass/tools/util/include") 43 | 44 | set(CUDA_PROPAGATE_HOST_FLAGS OFF) 45 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -w ${ARCH_FLAGS}) 46 | set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} -w ${ARCH_FLAGS}) 47 | set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} -w -O3 ${ARCH_FLAGS}) 48 | 49 | include_directories(${CUDA_INCLUDE_DIRS}) 50 | include_directories(${CUDNN_INCLUDE_DIRS}) 51 | 52 | include_directories("../") 53 | include_directories("../../../../") 54 | include_directories("../../../../build/third_party/install/glog/include") 55 | include_directories( 56 | "../../../../build/third_party/gflags/src/extern_gflags-build/include") 57 | 
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

from typing import NamedTuple

import context

import kaleido
from examples.hello_world.utils import *
from kaleido import FractalTensor, Tensor
from kaleido import operations as ops
from kaleido.parser.plot import PlotProgram

ctx = kaleido.Context()


@kaleido.params(ctx)
class Params(NamedTuple):
    # Per-depth recurrent weight matrices; f1 scans over zip(Ws, Us).
    Ws: FractalTensor[Tensor['512, 512', float, 'cpu']]
    Us: FractalTensor[Tensor['512, 512', float, 'cpu']]


@kaleido.function(ctx)
def f3(a: Tensor['1, 512', float, 'cpu'], b: Tensor['1, 512', float, 'cpu'],
       c: Tensor['512, 512', float, 'cpu'],
       d: Tensor['512, 512', float, 'cpu']) -> Tensor['1, 512', float, 'cpu']:
    # Cell computation: combine input `a` and state `b` through the two
    # weight matrices.
    y = a @ c + b @ d
    return y


@kaleido.function(ctx)
def f2(
        xs: FractalTensor[Tensor['1, 512', float, 'cpu']],
        w: Tensor['512, 512', float, 'cpu'], u: Tensor['512, 512', float, 'cpu']
) -> FractalTensor[Tensor['1, 512', float, 'cpu']]:
    # One layer: scan f3 over the time steps, threading the hidden state,
    # starting from a zero state.
    ys = ops.scan(lambda s, x: f3(x, s, w, u),
                  xs,
                  initializer=ops.zeros(shape=(1, 512),
                                        device='cpu',
                                        dtype='float'))
    return ys


@kaleido.function(ctx)
def f1(
        xs: FractalTensor[Tensor['1, 512', float, 'cpu']],
        Ws: FractalTensor[Tensor['512, 512', float,
                                 'cpu']], Us: FractalTensor[Tensor['512, 512',
                                                                   float, 'cpu']]
) -> FractalTensor[FractalTensor[Tensor['1, 512', float, 'cpu']]]:
    # Stack layers: scan over (W, U) pairs, feeding each layer's output
    # sequence to the next; the input sequence seeds the scan.
    yss = ops.scan(lambda state, x: f2(state, *x),
                   ops.zip(Ws, Us),
                   initializer=xs)
    return yss


@kaleido.function(ctx)
def f(
    xss: FractalTensor[FractalTensor[Tensor['1, 512', float,
                                            'cpu']]], params: Params
) -> FractalTensor[FractalTensor[FractalTensor[Tensor['1, 512', float,
                                                      'cpu']]]]:
    # Batch dimension: map the stacked network over each input sequence.
    ysss = ops.map(lambda xs: f1(xs, params.Ws, params.Us), xss)
    return ysss


# Propagate storage over the parsed program and plot its IR graph.
block = ctx[-1].ir_block
block.propagate_storage()

p = PlotProgram()
p.plot(block)

if __name__ == '__main__':
    param = Params(Ws=Ws, Us=Us)

    ysss = f(xss, param)
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 
36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /examples/sparse_attention/bigbird.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # -------------------------------------------------------------------------- 5 | 6 | import context 7 | from sparse_attention_utils import * 8 | 9 | import kaleido 10 | from kaleido import operations as ops 11 | 12 |  | # Normalize one query block's attention scores: softmax over the concatenated  | # [left-global | window | right-global] row (width 32 + 96 + 32 = 160), then  | # split back into 5 width-32 chunks along dim 1: [g1, w_a, w_b, w_c, g2]. 13 | def norm( 14 | g1: Tensor['32, 32', float, 15 | 'cuda'], w1: Tensor['32, 96', float, 16 | 'cuda'], g2: Tensor['32, 32', float, 'cuda'] 17 | ) -> FractalTensor[Tensor['32, 32', float, 'cuda']]: 18 | v = ops.softmax(ops.cat((g1, w1, g2), 1), 1) 19 | v = ops.split(v, 5, 1) 20 | return v 21 | 22 |  | # Sparse attention over one sequence of 32-row blocks: windowed attention  | # plus attention to the first (ks[0]) and last (ks[-1]) blocks as globals.  | # NOTE(review): qs[2:-2] skips two boundary blocks at each end — presumably  | # those are the global-token blocks; confirm against the paper/utils. 23 | def attn_func( 24 | qs: FractalTensor[Tensor['32, 512', float, 'cuda']], 25 | ks: FractalTensor[Tensor['32, 512', float, 26 | 'cuda']], vs: FractalTensor[Tensor['32, 512', 27 | float, 'cuda']] 28 | ) -> FractalTensor[Tensor['32, 512', float, 'cuda']]: 29 | # windowed attention and global attention 30 | # NOTE: Multiple heads and random attention are OMITTED for brevity. 
 | # Window keys/values: a window_size=3 shifted slide over (ks, vs). 31 | wks, wvs = ops.shifted_slide(ops.zip(ks, vs), window_size=3) 32 | wys = ops.map(lambda x: x[0] @ ops.flatten(x[1]).T, 33 | ops.zip(qs[2:-2], wks[2:-2])) 34 | gys1 = ops.map(lambda x: x @ ks[0].T, qs[2:-2]) # left global attention 35 | gys2 = ops.map(lambda x: x @ ks[-1].T, qs[2:-2]) # right global attention 36 |  | # Joint softmax across [left-global, window, right-global] scores per block. 37 | normed_vecs = ops.map(lambda x: norm(*x), ops.zip(gys1, wys, gys2)) 38 |  | # Weighted values: chunks 0 and -1 weight the global blocks; chunks 1..3  | # ([1:-1]) weight the 3-block window. NOTE(review): the rebindings of wvs  | # and vs below shadow the earlier names — intentional but easy to misread. 39 | gvs1 = ops.map(lambda x: x[0] @ vs[0], normed_vecs) 40 | gvs2 = ops.map(lambda x: x[-1] @ vs[-1], normed_vecs) 41 | 42 | wvs = ops.map(lambda x: ops.flatten(x[0][1:-1]).T @ ops.flatten(x[1]), 43 | ops.zip(normed_vecs, wvs[2:-2])) 44 | vs = ops.map(lambda x: x[0] + x[1] + x[2], ops.zip(gvs1, gvs2, wvs)) 45 | return vs 46 | 47 |  | # Batch entry point: apply attn_func to each (qs, ks, vs) triple. 48 | def bigbird( 49 | qss: FractalTensor[FractalTensor[Tensor['32, 512', float, 'cuda']]], 50 | kss: FractalTensor[FractalTensor[Tensor['32, 512', float, 'cuda']]], 51 | vss: FractalTensor[FractalTensor[Tensor['32, 512', float, 'cuda']]] 52 | ) -> FractalTensor[FractalTensor[Tensor['32, 512', float, 'cuda']]]: 53 | v = ops.map(lambda xs: attn_func(*xs), ops.zip(qss, kss, vss)) 54 | return v 55 | 56 |  | # 4096 tokens in blocks of 32 -> 128 blocks per sequence.  | # NOTE(review): Tensor, FractalTensor and create_blocked_input come from the  | # star-import of sparse_attention_utils — verify. 57 | if __name__ == '__main__': 58 | batch_size = 16 59 | seq_len = 4096 60 | hidden = 512 61 | block_size = 32 62 | 63 | qss = create_blocked_input(batch_size, hidden, block_size, seq_len) 64 | kss = create_blocked_input(batch_size, hidden, block_size, seq_len) 65 | vss = create_blocked_input(batch_size, hidden, block_size, seq_len) 66 | 67 | bigbird(qss, kss, vss) 68 | --------------------------------------------------------------------------------