├── .gitignore ├── LICENSE ├── README.md └── flextensor ├── __init__.py ├── baselines ├── bilinear_baseline.py ├── block_matrix_circulant_baseline.cu ├── block_matrix_circulant_baseline.py ├── conv-autotvm │ ├── autotvm_opt_topi_conv2d_cpu.py │ ├── tune_conv2d_cuda.py │ ├── tune_conv2d_nchw_cuda.py │ ├── tune_depthwise_cuda.py │ ├── tune_nnvm_cuda.py │ └── tune_relay_cuda.py ├── conv-cudnn │ ├── Makefile │ └── cudnn_conv.cu ├── conv-mkl │ └── simple_conv.cpp ├── conv-tvm │ └── opt_conv_cuda.py ├── conv1d_baseline.py ├── conv2d_baseline.py ├── conv3d_baseline.py ├── conv_transpose1d_baseline.py ├── conv_transpose2d_baseline.py ├── conv_transpose3d_baseline.py ├── depthwise_baseline.py ├── dilation_baseline.py ├── flextensor │ ├── conv1d-p100.txt │ ├── conv1d-titanxp.txt │ ├── conv1d-v100.txt │ ├── conv3d-p100.txt │ ├── conv3d-titanxp.txt │ ├── conv3d-v100.txt │ ├── dilation-v100.txt │ ├── group_conv2d-v100.txt │ ├── yolo-conv2d-b128-v100.txt │ ├── yolo-conv2d-b32-v100.txt │ ├── yolo-conv2d-p100.txt │ ├── yolo-conv2d-titanxp.txt │ └── yolo-conv2d-v100.txt ├── gatedPixelCNN_baseline.py ├── gemm-cublas │ ├── Makefile │ ├── Makefile.old │ ├── cublas_batch_gemm.cu │ ├── cublas_gemm.cu │ ├── helper_cuda.h │ └── helper_string.h ├── gemm_baseline.py ├── gemv_baseline.py ├── grouped_baseline.py ├── pixelCNN_baseline.py ├── shift_conv2d_baseline.py ├── sparse │ └── sparse-gemm.py ├── taco │ └── taco-gemm.c ├── unpooling1d_baseline.py └── unpooling2d_baseline.py ├── configs ├── PixelCNN_config.py ├── bilinear_config.py ├── block_circulant_matrix_config.py ├── conv1d_config.py ├── conv2d_config.py ├── conv3d_config.py ├── conv_transpose2d_config.py ├── depthwise_config.py ├── dilation_config.py ├── gated_pixelcnn_config.py ├── gemm_config.py ├── gemv_config.py ├── grouped_config.py ├── maxunpooling1d_config.py ├── maxunpooling2d_config.py ├── mttkrp_config.py └── shift_conv2d_config.py ├── examples ├── __init__.py ├── autotvm_opt_conv1_cpu.py ├── autotvm_opt_topi_conv2d_cpu.py ├── autotvm_opt_topi_conv2d_gpu.py ├── autotvm_opt_topi_matmul_cpu.py ├── opt_blur2d_cpu.py ├── opt_conv3d_cpu.py ├── opt_conv_cpu.py ├── opt_conv_gpu.py ├── opt_gemm_cpu.py ├── opt_gemm_gpu.py ├── opt_mttkrp3_cpu.py ├── opt_outer_cpu.py ├── run_experiments.py ├── single_operation.py ├── transfer_cpu.py └── transfer_gpu.py ├── measure.py ├── model.py ├── nn ├── README.md ├── __init__.py ├── layers.py └── ops.py ├── optimize ├── README.md ├── common.py ├── conv1d-config.log ├── conv2d-config.log ├── depthwise_conv2d-config.log ├── gemm-config.log ├── gemm-config.old.log ├── gemm-config.v0.log ├── gemm-config.v1.log ├── optimize_bilinear.py ├── optimize_block_circulant_matrix.py ├── optimize_conv1d.py ├── optimize_conv2d.py ├── optimize_conv2d_1x1_packed.py ├── optimize_conv3d.py ├── optimize_conv_transpose1d.py ├── optimize_conv_transpose2d.py ├── optimize_conv_transpose3d.py ├── optimize_depthwise_conv2d.py ├── optimize_dilation_conv2d.py ├── optimize_gatedPixelCNN.py ├── optimize_gemm.py ├── optimize_gemm_conv2d.py ├── optimize_gemv.py ├── optimize_grouped_conv2d.py ├── optimize_mttkrp.py ├── optimize_pixelCNN.py ├── optimize_shift_conv2d.py ├── optimize_test_conv.py ├── optimize_unpooling1d.py ├── optimize_unpooling2d.py ├── run_remote_opencl_conv1d.sh ├── run_remote_opencl_conv2d.sh ├── run_remote_opencl_depthwise_conv2d.sh └── run_remote_opencl_gemm.sh ├── project └── tensor_graph │ ├── README.md │ ├── build.py │ ├── conv2d_model │ ├── conv2d_model.pkl │ └── log_conv2d_train.txt │ ├── dataset │ ├── all.txt │ ├── all_test.txt 
│ ├── all_train.txt │ ├── conv2d.txt │ ├── conv2d_test.txt │ ├── conv2d_train.txt │ ├── gemm.txt │ ├── gemm_test.txt │ ├── gemm_train.txt │ └── preprocess.py │ ├── example.py │ ├── expr_visitor.py │ ├── gemm_model │ ├── gemm_model.pkl │ └── log_gemm_train.txt │ ├── graph.py │ ├── model.py │ ├── node.py │ ├── ops.py │ ├── preprocess.py │ ├── softmax_issue.py │ ├── space.py │ ├── train.py │ └── utils.py ├── scheduler.py ├── space.py ├── task.py ├── templates ├── __init__.py ├── cpu.py ├── cuda.py ├── opencl.py └── utils.py ├── test ├── __init__.py ├── check_grouped_results.py ├── naive_schedule_all.py ├── pyimpl.py ├── test_halide │ ├── network.cpp │ └── tutorial.md ├── test_ops.py ├── test_scheduler.py ├── test_tvm │ ├── grad │ │ ├── dqn_pytorch.py │ │ ├── layers.py │ │ ├── relay-dqn.py │ │ └── relay-lenet.py │ ├── graph │ │ ├── placeholder-only.py │ │ └── share-placeholder.py │ └── legacy │ │ ├── android_gemm_square.py │ │ ├── conv.py │ │ ├── cross_compilation_and_rpc.py │ │ ├── cuda_gemm_square.py │ │ ├── depthwise_conv2d_test.py │ │ ├── gemm_int8.py │ │ ├── lstm.py │ │ ├── matexp.py │ │ ├── multi_compute_inline.py │ │ ├── opt_conv_cpu.py │ │ ├── opt_conv_cuda.py │ │ ├── opt_gemm.py │ │ ├── test_broadcast_map.py │ │ ├── test_compute_inline.py │ │ ├── test_conv2d_hwcn_map.py │ │ ├── test_conv_int8_intel.py │ │ ├── test_multi_outputs.py │ │ ├── test_one_thread.py │ │ ├── test_reduce_map.py │ │ ├── tune_relay_x86.py │ │ └── variant_scale.py └── test_tvm_expr │ ├── grad │ ├── requires_grad.py │ ├── te-avgpool-case1.py │ ├── te-broadcast-case1.py │ ├── te-cat-case1.py │ ├── te-conv2d-case1.py │ ├── te-conv2d-case2.py │ ├── te-conv2d-case3.py │ ├── te-conv2d-topi-case1.py │ ├── te-cross_entropy-case1.py │ ├── te-downcast-case1.py │ ├── te-flatten.py │ ├── te-gemm.py │ ├── te-maxpool-case1.py │ ├── te-mse_loss-case1.py │ ├── te-padding-case1.py │ ├── te-power-case1.py │ ├── te-repeat-case1.py │ ├── te-softmax-case1.py │ ├── te-sub-case1.py │ ├── te-sub-case2.py │ ├── test_report.md │ ├── tir-relu-case1.py │ └── tir-tanh-case1.py │ ├── network │ └── lenet.py │ └── train │ ├── get_lm_data.sh │ ├── lenet-CEloss-new-api.py │ ├── lenet-CEloss.py │ ├── lenet.py │ ├── lltm.py │ ├── mi_lstm_pytorch.py │ ├── mlp.py │ ├── scrnn_pytorch.py │ └── train-language-modeling.py ├── testing ├── __init__.py ├── array_mul.py ├── get_feature.py ├── net │ ├── nnvm-mobilenet-v1.py │ ├── nnvm-mobilenet-v2.py │ ├── overfeat.py │ ├── pytorch-overfeat.py │ ├── pytorch-yolo-v1.py │ └── yolo_v1.py ├── others │ ├── assemble.py │ ├── compare_conv_cpu.py │ ├── hand-craft │ │ ├── complex-gemm.cl │ │ ├── config_yolo1_cpu.py │ │ ├── config_yolo1_cuda.py │ │ ├── config_yolo24_cpu.py │ │ ├── config_yolo24_cuda.py │ │ ├── conv_example.cl │ │ ├── gemmini-conv2d-3x3-nhwc-spike.py │ │ ├── gemmini-conv2d-3x3-nhwc-zync.py │ │ ├── gemmini-gemv-spike.py │ │ ├── gemmini-ttm-spike.py │ │ ├── hcl_gemm.py │ │ ├── optimize_conv2d.py │ │ ├── schedule_conv2d_1x1.py │ │ ├── schedule_conv2d_nchw_cuda.py │ │ ├── schedule_conv2d_nchw_x86.py │ │ ├── schedule_conv2d_nchwc_x86.py │ │ ├── schedule_conv2d_vhls.py │ │ ├── schedule_gemm_conv2d_x86.py │ │ ├── schedule_gemm_vhls.py │ │ ├── schedule_shfit_x86.py │ │ ├── schedule_shift_cuda.py │ │ ├── simple-gemm.cl │ │ ├── tune_conv2d_NCHWc.py │ │ └── tvm_pragma.py │ ├── profile │ │ ├── Makefile │ │ ├── compute_flops.py │ │ ├── profile_autate_yolo_b8_conv11.cu │ │ ├── profile_autotvm_yolo_b8_conv11.cu │ │ ├── profile_flextensor_yolo_b1_conv3.cu │ │ ├── run_tune.txt │ │ ├── yolo_conv11_opencl_autate.cl │ │ └── 
yolo_conv11_opencl_autotvm.cl │ ├── test_conv2d_cuda_behavior.py │ ├── test_conv2d_cuda_different_schedule.py │ ├── test_conv2d_llvm_behavior.py │ └── tune_batch_conv2d_cuda.py ├── test_ir_visit.py └── test_ir_visit_print.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | notes 3 | experiment_data 4 | *vscode* 5 | .idea 6 | .DS_Store 7 | *.cl 8 | *.csv 9 | *.zip 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2021 Size Zheng 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /flextensor/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /flextensor/baselines/bilinear_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | import torch 4 | import numpy as np 5 | from flextensor.configs.bilinear_config import bilinear_shapes 6 | torch.backends.cudnn.enabled = False 7 | 8 | 9 | def pytorch_cpu(N, K1, K2, M, number=100, dev=0): 10 | run_time = timeit.timeit(setup= 'import torch\n' 11 | 'A = torch.rand([' + str(N) + ', ' + str(K1) + '], dtype=torch.float32)\n' 12 | 'B = torch.rand([' + str(N) + ', ' + str(K2) + '], dtype=torch.float32)\n' 13 | 'C = torch.rand([' + str(M) + ', ' + str(K1) + ', ' + str(K2) + '], dtype=torch.float32)\n' 14 | 'torch.nn.functional.bilinear(A, B, C)\n', 15 | stmt='ans = torch.nn.functional.bilinear(A, B, C)', 16 | number=number) 17 | return run_time / number * 1e3 18 | 19 | 20 | def pytorch_cuda(N, K1, K2, M, number=100, dev=0): 21 | A = torch.rand([N, K1], dtype=torch.float32).cuda("cuda:" + str(dev)) 22 | B = torch.rand([N, K2], dtype=torch.float32).cuda("cuda:" + str(dev)) 23 | C = torch.rand([M, K1, K2], dtype=torch.float32).cuda("cuda:" + str(dev)) 24 | 25 | # warm-up 26 | torch.nn.functional.bilinear(A, B, C) 27 | torch.cuda.synchronize() 28 | sum_time = 0.0 29 | for i in range(number): 30 | start = torch.cuda.Event(enable_timing=True) 31 | end = torch.cuda.Event(enable_timing=True) 32 | start.record() 33 | ans = torch.nn.functional.bilinear(A, B, C) 34 | end.record() 35 | 36 | # Waits for everything to finish running 37 | torch.cuda.synchronize() 38 | sum_time += start.elapsed_time(end) 39 | 40 | return sum_time / number 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 46 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 47 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 48 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 49 | parser.add_argument("--device", help="target device number", type=int, default=0) 50 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 51 | 52 | args = parser.parse_args() 53 | shapes = bilinear_shapes 54 | if args.to < 0: 55 | end = len(shapes) 56 | else: 57 | end = args.to 58 | shapes = shapes[args.from_:end] 59 | if args.type == "pytorch": 60 | if args.target == "cuda": 61 | baseline = pytorch_cuda 62 | elif args.target == "llvm": 63 | baseline = pytorch_cpu 64 | else: 65 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 66 | else: 67 | raise RuntimeError("Only implement pytorch baseline now, no '%s' baseline"%args.type) 68 | 69 | print("%s baselines bilinear for target %s (%d):" % (args.type, args.target, args.device)) 70 | for i, shape in enumerate(shapes): 71 | count = i + args.from_ 72 | print("layer", count) 73 | N, K1, K2, M = shape 74 | cost = baseline(N, K1, K2, M, args.number, args.device) 75 | print("Use %f(ms)" % cost) 76 | print("Done!") 77 | -------------------------------------------------------------------------------- /flextensor/baselines/block_matrix_circulant_baseline.py:
-------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import copy 4 | import torch 5 | import numpy as np 6 | from flextensor.configs.block_circulant_matrix_config import block_circulant_matrix_shapes as shapes 7 | 8 | def python_cpu(ROW, COL, FFT, number=10, dev=0): 9 | Input = np.random.random([ROW, COL]).astype(np.float32) 10 | Output = np.random.random([ROW, COL]).astype(np.float32) 11 | 12 | def run(): 13 | nonlocal Input, Output 14 | for i in range(ROW // FFT): 15 | sub_vec = np.zeros([FFT], dtype=np.float32) 16 | vec = np.zeros([COL], dtype=np.float32) 17 | for t in range(COL // FFT): 18 | for m in range(FFT): 19 | for n in range(FFT): 20 | vec[t * FFT + m] += \ 21 | Input[FFT * i + n][t * FFT + (m + n) % FFT] / FFT 22 | 23 | for j in range(FFT): 24 | for k in range(COL//FFT): 25 | if j >= 1: 26 | sub_vec[0] = vec[FFT * (k + 1) - 1] 27 | sub_vec[1: FFT] = vec[FFT * k: FFT * (k + 1) - 1] 28 | vec[FFT * k: FFT * (k + 1)] = sub_vec 29 | Output[FFT * i + j][:] = copy.deepcopy(vec) 30 | 31 | sum_time = 0.0 32 | for _ in range(number): 33 | start = time.time() 34 | run() 35 | end = time.time() 36 | sum_time += end - start 37 | 38 | return sum_time / number * 1e3 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 44 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 45 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 46 | parser.add_argument("--target", help="target device type", type=str, default="cpu") 47 | parser.add_argument("--device", help="target device number", type=int, default=0) 48 | parser.add_argument("--type", help="type of baseline", type=str, default="python") 49 | 50 | args = parser.parse_args() 51 | if args.to < 0: 52 | end = len(shapes) 53 | else: 54 | end = args.to 55 | shapes = shapes[args.from_:end] 56 | 57 | if args.type == "python": 58 | if args.target == "cpu": 59 | baseline = python_cpu 60 | else: 61 | raise RuntimeError("Only support target 'cpu', but got %s"%args.target) 62 | else: 63 | raise RuntimeError("Only implement python baseline now, no '%s' baseline"%args.type) 64 | 65 | print("%s baselines block circulant matrix for target %s (%d):" % (args.type, args.target, args.device)) 66 | for i, shape in enumerate(shapes): 67 | count = i + args.from_ 68 | print("layer", count) 69 | cost = baseline(*shape, number=args.number, dev=args.device) 70 | print("Use %f(ms)" % cost) 71 | print("Done!") 72 | -------------------------------------------------------------------------------- /flextensor/baselines/conv-cudnn/Makefile: -------------------------------------------------------------------------------- 1 | CXX := nvcc 2 | TARGET := cudnn_conv 3 | CUDNN_PATH := /usr/local/cuda-10.1 4 | HEADERS := -I $(CUDNN_PATH)/include 5 | LIBS := -L $(CUDNN_PATH)/lib64 -L/usr/local/lib 6 | LIBDEVICE := --dont-use-profile -ldir $(CUDNN_PATH)/nvvm/libdevice 7 | CXXFLAGS := -arch=sm_70 -std=c++11 -O2 8 | 9 | all: conv 10 | 11 | conv: $(TARGET).cu 12 | $(CXX) $(CXXFLAGS) $(LIBDEVICE) $(HEADERS) $(LIBS) $(TARGET).cu -o $(TARGET) \ 13 | -lcudnn 14 | 15 | .PHONY: clean 16 | 17 | clean: 18 | rm $(TARGET) || echo -n "" -------------------------------------------------------------------------------- /flextensor/baselines/conv1d_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import
timeit 3 | import torch 4 | from flextensor.configs.conv1d_config import conv1d_shapes 5 | torch.backends.cudnn.enabled = True 6 | 7 | 8 | def pytorch_cpu(batch_size, length, channel, kernel_size, output_channel, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 9 | run_time = timeit.timeit(setup= 'import torch\n' 10 | 'conv = torch.nn.functional.conv1d\n' 11 | 'A = torch.rand([' + str(batch_size) + ', ' + str(channel) + ', ' + str(length) + '], dtype=torch.float32)\n' 12 | 'W = torch.rand([' + str(output_channel) + ', ' + str(channel//groups) + ', ' + str(kernel_size) + '], dtype=torch.float32)\n' 13 | 'conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')\n', 14 | stmt='ans = conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')', 15 | number=number) 16 | return run_time / number * 1e3 17 | 18 | 19 | def pytorch_cuda(N, L, C, kernel_size, K, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 20 | A = torch.rand([N, C, L], dtype=torch.float32).cuda("cuda:" + str(dev)) 21 | W = torch.rand([K, C//groups, kernel_size], dtype=torch.float32).cuda("cuda:" + str(dev)) 22 | 23 | # warm-up 24 | torch.nn.functional.conv1d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 25 | torch.cuda.synchronize() 26 | sum_time = 0.0 27 | for i in range(number): 28 | start = torch.cuda.Event(enable_timing=True) 29 | end = torch.cuda.Event(enable_timing=True) 30 | start.record() 31 | ans = torch.nn.functional.conv1d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 32 | end.record() 33 | 34 | # Waits for everything to finish running 35 | torch.cuda.synchronize() 36 | sum_time += start.elapsed_time(end) 37 | return sum_time / number 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 43 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 44 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 45 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 46 | parser.add_argument("--device", help="target device number", type=int, default=0) 47 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 48 | 49 | args = parser.parse_args() 50 | shapes = conv1d_shapes 51 | if args.to < 0: 52 | end = len(shapes) 53 | else: 54 | end = args.to 55 | shapes = shapes[args.from_:end] 56 | if args.type == "pytorch": 57 | if args.target == "cuda": 58 | baseline = pytorch_cuda 59 | elif args.target == "llvm": 60 | baseline = pytorch_cpu 61 | else: 62 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 63 | else: 64 | raise RuntimeError("Only implement pytorch baseline now, no '%s' baseline"%args.type) 65 | 66 | print("%s baselines convolution 1d for target %s (%d):" % (args.type, args.target, args.device)) 67 | for i, shape in enumerate(shapes): 68 | count = i + args.from_ 69 | print("layer", count) 70 | batch, in_channel, length, out_channel, _, k_len, _, stride, padding, dilation, groups = shape 71 | cost = baseline(batch, length, in_channel, k_len, out_channel, stride=stride, padding=padding, dilation=dilation, groups=groups, number=args.number, dev=args.device) 72 | print("Use %f(ms)" % cost) 73 | print("Done!") 74 | 
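Note: every `pytorch_cuda` baseline in this directory (bilinear, conv1d, conv3d, dilation, gemm, gemv, grouped) repeats the same CUDA-event timing loop seen above. The helper below is a minimal sketch of that shared pattern, not code from the repository; the names `cuda_time_ms`, `fn`, and `warmup` are hypothetical, introduced here only for illustration.

import torch

def cuda_time_ms(fn, number=100, warmup=1):
    # Warm-up runs, matching the baselines' behavior before timing starts.
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    total = 0.0
    for _ in range(number):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()
        end.record()
        # Synchronize so start.elapsed_time(end) (milliseconds) is valid.
        torch.cuda.synchronize()
        total += start.elapsed_time(end)
    return total / number

With this helper, the body of `pytorch_cuda` in conv1d_baseline.py reduces to `cuda_time_ms(lambda: torch.nn.functional.conv1d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups), number)` once `A` and `W` are on the GPU.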
-------------------------------------------------------------------------------- /flextensor/baselines/conv3d_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | import torch 4 | from flextensor.configs.conv3d_config import conv3d_shapes 5 | torch.backends.cudnn.enabled = True 6 | 7 | 8 | def pytorch_cpu(batch_size, depth, height, width, channel, kernel_size, output_channel, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 9 | run_time = timeit.timeit(setup= 'import torch\n' 10 | 'conv = torch.nn.functional.conv3d\n' 11 | 'A = torch.rand([' + str(batch_size) + ', ' + str(channel) + ', ' + str(depth) + ', ' + str(height) + ', ' + str(width) + '], dtype=torch.float32)\n' 12 | 'W = torch.rand([' + str(output_channel) + ', ' + str(channel//groups) + ', ' + str(kernel_size) + ', ' + str(kernel_size) + ', ' + str(kernel_size) + '], dtype=torch.float32)\n' 13 | 'conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')\n', 14 | stmt='ans = conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')', 15 | number=number) 16 | return run_time / number * 1e3 17 | 18 | 19 | def pytorch_cuda(N, D, H, W, C, kernel_size, K, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 20 | A = torch.rand([N, C, D, H, W], dtype=torch.float32).cuda("cuda:" + str(dev)) 21 | W = torch.rand([K, C//groups, kernel_size, kernel_size, kernel_size], dtype=torch.float32).cuda("cuda:" + str(dev)) 22 | 23 | # warm-up 24 | torch.nn.functional.conv3d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 25 | torch.cuda.synchronize() 26 | sum_time = 0.0 27 | for i in range(number): 28 | start = torch.cuda.Event(enable_timing=True) 29 | end = torch.cuda.Event(enable_timing=True) 30 | start.record() 31 | ans = torch.nn.functional.conv3d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 32 | end.record() 33 | 34 | # Waits for everything to finish running 35 | torch.cuda.synchronize() 36 | sum_time += start.elapsed_time(end) 37 | return sum_time / number 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 43 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 44 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 45 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 46 | parser.add_argument("--device", help="target device number", type=int, default=0) 47 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 48 | 49 | args = parser.parse_args() 50 | shapes = conv3d_shapes 51 | if args.to < 0: 52 | end = len(shapes) 53 | else: 54 | end = args.to 55 | shapes = shapes[args.from_:end] 56 | if args.type == "pytorch": 57 | if args.target == "cuda": 58 | baseline = pytorch_cuda 59 | elif args.target == "llvm": 60 | baseline = pytorch_cpu 61 | else: 62 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 63 | else: 64 | raise RuntimeError("Only implement pytorch baseline now, no '%s' baseline"%args.type) 65 | 66 | print("%s baselines convolution 3d for target %s (%d):" % (args.type, args.target, args.device)) 67 | for i, shape in enumerate(shapes): 68 | count = i + args.from_ 69 | 
print("layer", count) 70 | batch, in_channel, D, H, W, out_channel, _, k, _, stride, padding, _, _ = shape 71 | cost = baseline(batch, D, H, W, in_channel, k, out_channel, stride=stride, padding=padding, number=args.number, dev=args.device) 72 | print("Use %f(ms)" % cost) 73 | print("Done!") 74 | -------------------------------------------------------------------------------- /flextensor/baselines/dilation_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | import torch 4 | from flextensor.configs.dilation_config import dilation_shapes 5 | torch.backends.cudnn.enabled = False 6 | 7 | 8 | def pytorch_cpu(batch_size, height, width, channel, kernel_size, output_channel, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 9 | run_time = timeit.timeit(setup= 'import torch\n' 10 | 'conv = torch.nn.functional.conv2d\n' 11 | 'A = torch.rand([' + str(batch_size) + ', ' + str(channel) + ', ' + str(height) + ', ' + str(width) + '], dtype=torch.float32)\n' 12 | 'W = torch.rand([' + str(output_channel) + ', ' + str(channel//groups) + ', ' + str(kernel_size) + ', ' + str(kernel_size) + '], dtype=torch.float32)\n' 13 | 'conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')\n', 14 | stmt='ans = conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')', 15 | number=number) 16 | return run_time / number * 1e3 17 | 18 | 19 | def pytorch_cuda(N, H, W, C, kernel_size, K, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 20 | A = torch.rand([N, C, H, W], dtype=torch.float32).cuda("cuda:" + str(dev)) 21 | W = torch.rand([K, C//groups, kernel_size, kernel_size], dtype=torch.float32).cuda("cuda:" + str(dev)) 22 | 23 | # warm-up 24 | torch.nn.functional.conv2d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 25 | torch.cuda.synchronize() 26 | sum_time = 0.0 27 | for i in range(number): 28 | start = torch.cuda.Event(enable_timing=True) 29 | end = torch.cuda.Event(enable_timing=True) 30 | start.record() 31 | ans = torch.nn.functional.conv2d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 32 | end.record() 33 | 34 | # Waits for everything to finish running 35 | torch.cuda.synchronize() 36 | 37 | sum_time += start.elapsed_time(end) 38 | return sum_time / number 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 44 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 45 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 46 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 47 | parser.add_argument("--device", help="target device number", type=int, default=0) 48 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 49 | 50 | args = parser.parse_args() 51 | shapes = dilation_shapes 52 | if args.to < 0: 53 | end = len(shapes) 54 | else: 55 | end = args.to 56 | shapes = shapes[args.from_:end] 57 | if args.type == "pytorch": 58 | if args.target == "cuda": 59 | baseline = pytorch_cuda 60 | elif args.target == "llvm": 61 | baseline = pytorch_cpu 62 | else: 63 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 64 | else: 65 | raise RuntimeError("Only 
implement pytorch baseline now, no '%s' baseline"%args.type) 66 | 67 | print("%s baselines dilation convolution for target %s (%d):" % (args.type, args.target, args.device)) 68 | for i, shape in enumerate(shapes): 69 | count = i + args.from_ 70 | print("layer", count) 71 | batch, in_channel, H, W, out_channel, k, _, stride, padding, dilation, groups = shape 72 | cost = baseline(batch, H, W, in_channel, k, out_channel, stride=stride, padding=padding, dilation=dilation, groups=groups, number=args.number, dev=args.device) 73 | print("Use %f(ms)" % cost) 74 | print("Done!") 75 | -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/conv1d-p100.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/baselines/flextensor/conv1d-p100.txt -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/conv1d-titanxp.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/baselines/flextensor/conv1d-titanxp.txt -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/conv1d-v100.txt: -------------------------------------------------------------------------------- 1 | conv1d_conv1d_batch1_(1, 192, 3136, 128, 1, 1, 0, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [8, 2, 3, 4], [49, 2, 32, 1]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[512, 0]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [8, 4, 4, 1], [49, 2, 32, 1]], "reduce": [[4, 3, 16], [1, 1, 1]], "reorder": [[1]], "inline": [], "unroll": [[512, 1]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [[1, 0]]}] 2 | conv1d_conv1d_batch1_(1, 128, 3136, 256, 9, 1, 1, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [16, 1, 1, 8], [6, 1, 523, 1]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1, 1]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [16, 4, 2, 2], [5, 1, 313, 2]], "reduce": [[32, 1, 4], [1, 1, 9]], "reorder": [[2]], "inline": [], "unroll": [[1500, 0]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [[0, 1]]}] 3 | conv1d_conv1d_batch1_(1, 512, 784, 256, 1, 1, 0, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [16, 1, 8, 4], [14, 1, 56, 1]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[512, 0]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [8, 4, 8, 1], [28, 1, 14, 2]], "reduce": [[8, 4, 16], [1, 1, 1]], "reorder": [[1]], "inline": [], "unroll": [[1500, 1]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [[0, 1]]}] 4 | conv1d_conv1d_batch1_(1, 256, 784, 512, 9, 1, 1, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [256, 1, 1, 1], [1, 3, 262, 1]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[512, 1]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [128, 2, 1, 2], [1, 1, 389, 2]], "reduce": [[32, 2, 4], [1, 1, 9]], "reorder": [[2]], "inline": [], "unroll": [[1500, 1]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], 
"reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [[1, 1]]}] 5 | conv1d_conv1d_batch1_(1, 1024, 196, 512, 1, 1, 0, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [64, 2, 2, 4], [1, 2, 98, 1]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[0, 0]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [32, 2, 8, 1], [7, 1, 14, 2]], "reduce": [[2, 32, 16], [1, 1, 1]], "reorder": [[1]], "inline": [], "unroll": [[1500, 1]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [[0, 0]]}] 6 | conv1d_conv1d_batch1_(1, 512, 196, 1024, 9, 1, 1, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [128, 1, 4, 1], [2, 1, 33, 3]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [256, 4, 1, 1], [1, 1, 95, 2]], "reduce": [[1, 128, 4], [1, 1, 9]], "reorder": [[1]], "inline": [], "unroll": [[512, 0]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [[0, 1]]}] 7 | conv1d_conv1d_batch1_(1, 1024, 49, 1024, 9, 1, 1, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [256, 2, 1, 2], [1, 1, 51, 1]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[512, 0]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [128, 4, 2, 1], [1, 1, 43, 1]], "reduce": [[32, 2, 16], [1, 1, 9]], "reorder": [[1]], "inline": [], "unroll": [[1500, 1]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [[1, 1]]}] -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/conv3d-p100.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/baselines/flextensor/conv3d-p100.txt -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/conv3d-titanxp.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/baselines/flextensor/conv3d-titanxp.txt -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/yolo-conv2d-p100.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/baselines/flextensor/yolo-conv2d-p100.txt -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/yolo-conv2d-titanxp.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/baselines/flextensor/yolo-conv2d-titanxp.txt -------------------------------------------------------------------------------- /flextensor/baselines/gemm-cublas/Makefile: -------------------------------------------------------------------------------- 1 | CXX := nvcc 2 | TARGET := cublas_gemm 3 | CUBLAS_PATH := /usr/local/cuda-10.1 4 | HEADERS := -I $(CUBLAS_PATH)/include 5 | LIBS := -L $(CUBLAS_PATH)/lib64 -L/usr/local/lib 6 | LIBDEVICE := --dont-use-profile -ldir 
$(CUBLAS_PATH)/nvvm/libdevice 7 | CXXFLAGS := -arch=sm_70 -std=c++11 -O2 8 | 9 | all: conv 10 | 11 | conv: $(TARGET).cu 12 | $(CXX) $(CXXFLAGS) $(LIBDEVICE) $(HEADERS) $(LIBS) $(TARGET).cu -o $(TARGET) \ 13 | -lcublas 14 | 15 | .PHONY: clean 16 | 17 | clean: 18 | rm $(TARGET) || echo -n "" -------------------------------------------------------------------------------- /flextensor/baselines/gemm_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | import torch 4 | import numpy as np 5 | from flextensor.configs.gemm_config import gemm_shapes 6 | torch.backends.cudnn.enabled = False 7 | 8 | 9 | def numpy_cpu(N, K, M, number=100, dev=0): 10 | run_time = timeit.timeit(setup='import numpy\n' 11 | 'N = ' + str(N) + '\n' 12 | 'K = ' + str(K) + '\n' 13 | 'M = ' + str(M) + '\n' 14 | 'dtype = "float32"\n' 15 | 'a = numpy.random.rand(N, K).astype(dtype)\n' 16 | 'b = numpy.random.rand(K, M).astype(dtype)\n', 17 | stmt='answer = numpy.dot(a, b)', 18 | number=number) 19 | return run_time / number * 1e3 20 | 21 | 22 | def pytorch_cpu(N, K, M, number=100, dev=0): 23 | run_time = timeit.timeit(setup= 'import torch\n' 24 | 'A = torch.rand([' + str(N) + ', ' + str(K) + '], dtype=torch.float32)\n' 25 | 'B = torch.rand([' + str(K) + ', ' + str(M) + '], dtype=torch.float32)\n' 26 | 'torch.mm(A, B)\n', 27 | stmt='ans = torch.mm(A, B)', 28 | number=number) 29 | return run_time / number * 1e3 30 | 31 | 32 | def pytorch_cuda(N, K, M, number=100, dev=0): 33 | A = torch.rand([N, K], dtype=torch.float32).cuda("cuda:" + str(dev)) 34 | B = torch.rand([K, M], dtype=torch.float32).cuda("cuda:" + str(dev)) 35 | 36 | # warm-up 37 | torch.mm(A, B) 38 | torch.cuda.synchronize() 39 | sum_time = 0.0 40 | for i in range(number): 41 | start = torch.cuda.Event(enable_timing=True) 42 | end = torch.cuda.Event(enable_timing=True) 43 | start.record() 44 | ans = torch.mm(A, B) 45 | end.record() 46 | 47 | # Waits for everything to finish running 48 | torch.cuda.synchronize() 49 | sum_time += start.elapsed_time(end) 50 | 51 | return sum_time / number 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 57 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 58 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 59 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 60 | parser.add_argument("--device", help="target device number", type=int, default=0) 61 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 62 | 63 | args = parser.parse_args() 64 | shapes = gemm_shapes 65 | if args.to < 0: 66 | end = len(shapes) 67 | else: 68 | end = args.to 69 | shapes = shapes[args.from_:end] 70 | if args.type == "pytorch": 71 | if args.target == "cuda": 72 | baseline = pytorch_cuda 73 | elif args.target == "llvm": 74 | baseline = pytorch_cpu 75 | else: 76 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 77 | elif args.type == "numpy": 78 | if args.target == "llvm": 79 | baseline = numpy_cpu 80 | else: 81 | raise RuntimeError("Only support target 'llvm', but got %s"%args.target) 82 | else: 83 | raise RuntimeError("Only implement pytorch and numpy baselines now, no '%s' baseline"%args.type) 84 | 85 | print("%s baselines gemm for target %s (%d):" % (args.type, args.target, args.device)) 86 | for i, shape in
enumerate(shapes): 87 | count = i + args.from_ 88 | print("layer", count) 89 | N, K, M = shape 90 | cost = baseline(N, K, M, args.number, args.device) 91 | print("Use %f(ms)" % cost) 92 | print("Done!") 93 | -------------------------------------------------------------------------------- /flextensor/baselines/gemv_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | import torch 4 | from flextensor.configs.gemv_config import gemv_shapes 5 | torch.backends.cudnn.enabled = False 6 | 7 | 8 | def pytorch_cpu(N, K, number=100, dev=0): 9 | run_time = timeit.timeit(setup= 'import torch\n' 10 | 'A = torch.rand([' + str(N) + ', ' + str(K) + '], dtype=torch.float32)\n' 11 | 'B = torch.rand([' + str(K) + ', 1], dtype=torch.float32)\n' 12 | 'torch.mm(A, B)\n', 13 | stmt='ans = torch.mm(A, B)', 14 | number=number) 15 | return run_time / number * 1e3 16 | 17 | 18 | def pytorch_cuda(N, K, number=100, dev=0): 19 | A = torch.rand([N, K], dtype=torch.float32).cuda("cuda:" + str(dev)) 20 | B = torch.rand([K, 1], dtype=torch.float32).cuda("cuda:" + str(dev)) 21 | 22 | # warm-up 23 | torch.mm(A, B) 24 | torch.cuda.synchronize() 25 | sum_time = 0.0 26 | for i in range(number): 27 | start = torch.cuda.Event(enable_timing=True) 28 | end = torch.cuda.Event(enable_timing=True) 29 | start.record() 30 | ans = torch.mm(A, B) 31 | end.record() 32 | 33 | # Waits for everything to finish running 34 | torch.cuda.synchronize() 35 | sum_time += start.elapsed_time(end) 36 | 37 | return sum_time / number 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 43 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 44 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 45 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 46 | parser.add_argument("--device", help="target device number", type=int, default=0) 47 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 48 | 49 | args = parser.parse_args() 50 | shapes = gemv_shapes 51 | if args.to < 0: 52 | end = len(shapes) 53 | else: 54 | end = args.to 55 | shapes = shapes[args.from_:end] 56 | if args.type == "pytorch": 57 | if args.target == "cuda": 58 | baseline = pytorch_cuda 59 | elif args.target == "llvm": 60 | baseline = pytorch_cpu 61 | else: 62 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 63 | else: 64 | raise RuntimeError("Only implement pytorch baseline now, no '%s' baseline"%args.type) 65 | 66 | print("%s baselines gemv for target %s (%d):" % (args.type, args.target, args.device)) 67 | for i, shape in enumerate(shapes): 68 | count = i + args.from_ 69 | print("layer", count) 70 | N, K, _ = shape 71 | cost = baseline(N, K, args.number, args.device) 72 | print("Use %f(ms)" % cost) 73 | print("Done!") 74 | -------------------------------------------------------------------------------- /flextensor/baselines/grouped_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | import torch 4 | from flextensor.configs.grouped_config import grouped_shapes 5 | torch.backends.cudnn.enabled = False 6 | 7 | 8 | def pytorch_cpu(batch_size, height, width, channel, kernel_size, output_channel, stride=1, padding=0, dilation=1, groups=1, number=100, 
dev=0): 9 | run_time = timeit.timeit(setup= 'import torch\n' 10 | 'conv = torch.nn.functional.conv2d\n' 11 | 'A = torch.rand([' + str(batch_size) + ', ' + str(channel) + ', ' + str(height) + ', ' + str(width) + '], dtype=torch.float32)\n' 12 | 'W = torch.rand([' + str(output_channel) + ', ' + str(channel//groups) + ', ' + str(kernel_size) + ', ' + str(kernel_size) + '], dtype=torch.float32)\n' 13 | 'conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')\n', 14 | stmt='ans = conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')', 15 | number=number) 16 | return run_time / number * 1e3 17 | 18 | 19 | def pytorch_cuda(N, H, W, C, kernel_size, K, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 20 | A = torch.rand([N, C, H, W], dtype=torch.float32).cuda("cuda:" + str(dev)) 21 | W = torch.rand([K, C//groups, kernel_size, kernel_size], dtype=torch.float32).cuda("cuda:" + str(dev)) 22 | 23 | # warm-up 24 | torch.nn.functional.conv2d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 25 | torch.cuda.synchronize() 26 | sum_time = 0.0 27 | for i in range(number): 28 | start = torch.cuda.Event(enable_timing=True) 29 | end = torch.cuda.Event(enable_timing=True) 30 | start.record() 31 | ans = torch.nn.functional.conv2d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 32 | end.record() 33 | 34 | # Waits for everything to finish running 35 | torch.cuda.synchronize() 36 | 37 | sum_time += start.elapsed_time(end) 38 | return sum_time / number 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 44 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 45 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 46 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 47 | parser.add_argument("--device", help="target device number", type=int, default=0) 48 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 49 | 50 | args = parser.parse_args() 51 | shapes = grouped_shapes 52 | if args.to < 0: 53 | end = len(shapes) 54 | else: 55 | end = args.to 56 | shapes = shapes[args.from_:end] 57 | if args.type == "pytorch": 58 | if args.target == "cuda": 59 | baseline = pytorch_cuda 60 | elif args.target == "llvm": 61 | baseline = pytorch_cpu 62 | else: 63 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 64 | else: 65 | raise RuntimeError("Only implement pytorch baseline now, no '%s' baseline"%args.type) 66 | 67 | print("%s baselines grouped convolution for target %s (%d):" % (args.type, args.target, args.device)) 68 | for i, shape in enumerate(shapes): 69 | count = i + args.from_ 70 | print("layer", count) 71 | batch, in_channel, H, W, out_channel, k, _, stride, padding, dilation, groups = shape 72 | cost = baseline(batch, H, W, in_channel, k, out_channel, stride=stride, padding=padding, dilation=dilation, groups=groups, number=args.number, dev=args.device) 73 | print("Use %f(ms)" % cost) 74 | print("Done!") 75 | -------------------------------------------------------------------------------- /flextensor/baselines/shift_conv2d_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 
import timeit 3 | import torch 4 | import time 5 | import tvm 6 | import topi 7 | import numpy as np 8 | import random 9 | import math 10 | from flextensor.configs.shift_conv2d_config import shift_conv2d_shape 11 | from flextensor.task import shiftconv2d 12 | 13 | torch.backends.cudnn.enabled = False 14 | 15 | def tvm_shift_conv2d_cpu(B, H, W, C, kernel_size, dilation, stride=1, number=100, dev=0): 16 | Input = torch.rand([B, H, W, C], dtype=torch.float32) 17 | Kernel = torch.rand([C, kernel_size, kernel_size], dtype=torch.float32) 18 | KernelIndex = torch.argmax(Kernel.reshape([C, -1]), dim=1) 19 | indexH = random.randint(0 ,kernel_size - 1) 20 | indexW = random.randint(0 ,kernel_size - 1) 21 | Kernel[:, indexH, indexW] = 0 22 | 23 | output, bufs = shiftconv2d(B, H, W, C, kernel_size, dilation, stride) 24 | s = tvm.te.create_schedule(output) 25 | ctx = tvm.cpu(dev) 26 | # print(tvm.lower(s, bufs, simple_mode=True)) 27 | f = tvm.build(s, bufs, 'llvm') 28 | 29 | im = tvm.nd.array(Input.numpy().astype(np.float32), ctx) 30 | fi = tvm.nd.array(KernelIndex.numpy().astype(np.int32), ctx) 31 | 32 | paddings = [math.ceil(((stride - 1) * H - stride + dilation * (kernel_size - 1)) / 2), 33 | math.ceil(((stride - 1) * W - stride + dilation * (kernel_size - 1)) / 2)] 34 | 35 | image_height = H 36 | image_width = W 37 | out_height = math.floor((image_height + 2 * paddings[0]- dilation * (kernel_size - 1) - 1) / stride + 1) 38 | out_width = math.floor((image_width + 2 * paddings[1] - dilation * (kernel_size - 1) - 1) / stride + 1) 39 | output_shape = (B, out_height, out_width, C) 40 | un = tvm.nd.array(np.zeros(output_shape).astype(np.float32), ctx) 41 | 42 | start_time = time.time() 43 | for i in range(number): 44 | f(im, fi, un) 45 | end_time = time.time() 46 | return (end_time - start_time) * 1e3 / number 47 | 48 | def tvm_shift_conv2d_cuda(B, H, W, C, kernel_size, stride, padding, number=100, dev=0): 49 | pass 50 | 51 | if __name__ == "__main__": 52 | shapes = shift_conv2d_shape 53 | 54 | """warm up""" 55 | """cost = pytorch_cpu(*shapes[0]) 56 | cost = pytorch_cuda(*shapes[0]) 57 | cost = tvm_shift_conv2d_cpu(*shapes[0])""" 58 | # cost = tvm_shift_conv2d_cuda(*shapes[0]) 59 | 60 | for shape in shapes: 61 | print("Shape", shape) 62 | cost = tvm_shift_conv2d_cpu(*shape) 63 | print("Tvm cost on cpu: {}ms".format(cost)) 64 | 65 | print("Done!") 66 | -------------------------------------------------------------------------------- /flextensor/baselines/sparse/sparse-gemm.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import torch 4 | 5 | 6 | def torch_spmm(M, N, K, dtype="float32", n_trial=1): 7 | spmm = torch.sparse.mm 8 | # a_np = np.random.uniform(-0.91, 0.9, [M, K]).astype(dtype) 9 | # b_np = np.random.uniform(-0.91, 0.9, [K, N]).astype(dtype) 10 | # a_torch = torch.relu(torch.tensor(a_np)).to_sparse() 11 | # b_torch = torch.tensor(b_np) 12 | m = torch.distributions.bernoulli.Bernoulli(torch.tensor(0.9)) 13 | a_torch = m.sample([M, K]).to_sparse() 14 | b_torch = m.sample([K, N]) 15 | 16 | # warm-up 17 | res = spmm(a_torch, b_torch) 18 | beg = time.time() 19 | for i in range(n_trial): 20 | spmm(a_torch, b_torch) 21 | end = time.time() 22 | return (end - beg) * 1e3 / n_trial 23 | 24 | 25 | def torch_spmv(M, K, dtype="float32", n_trial=1): 26 | spmm = torch.sparse.mm 27 | # a_np = np.random.uniform(-0.91, 0.9, [M, K]).astype(dtype) 28 | # b_np = np.random.uniform(-0.91, 0.9, [K, 1]).astype(dtype) 29 | # a_torch = 
torch.relu(torch.tensor(a_np)).to_sparse() 30 | # b_torch = torch.tensor(b_np) 31 | m = torch.distributions.bernoulli.Bernoulli(torch.tensor(0.9)) 32 | a_torch = m.sample([M, K]).to_sparse() 33 | b_torch = m.sample([K, 1]) 34 | 35 | # warm-up 36 | res = spmm(a_torch, b_torch) 37 | beg = time.time() 38 | for i in range(n_trial): 39 | spmm(a_torch, b_torch) 40 | end = time.time() 41 | return (end - beg) * 1e3 / n_trial 42 | 43 | 44 | if __name__ == "__main__": 45 | # for i in [1, 5, 10, 15, 20, 50]: 46 | # size = i * 2**10 47 | # try: 48 | # res = torch_spmm(size, size, size, n_trial=10) 49 | # except Exception as e: 50 | # print(str(e)) 51 | # res = float("inf") 52 | # print("Spmm pytorch: [scale: %d]: %f ms" % (size, res)) 53 | for i in range(1, 10 + 1): 54 | size = i * 10 * 2**10 55 | try: 56 | res = torch_spmv(size, size, n_trial=10) 57 | except Exception as e: 58 | print(str(e)) 59 | res = float("inf") 60 | print("Spmv pytorch: [scale: %d]: %f ms" % (size, res)) 61 | -------------------------------------------------------------------------------- /flextensor/configs/PixelCNN_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | (batch, H, W, in_C, out_C, KH, KW, mask_type, bias, dilation, stride, padding) 3 | """ 4 | PixelCNN_shape = [ 5 | (1, 256, 256, 3, 32, 7, 7, 'A', None, 1, 1, 0), 6 | (1, 128, 128, 256, 256, 3, 3, 'B', None, 1, 1, 0) 7 | ] -------------------------------------------------------------------------------- /flextensor/configs/bilinear_config.py: -------------------------------------------------------------------------------- 1 | bilinear_shapes = [ 2 | (32, 1024, 1024, 32), 3 | (64, 512, 1024, 32), 4 | (128, 256, 512, 64), 5 | (256, 128, 256, 128), 6 | (512, 64, 128, 256), 7 | ] -------------------------------------------------------------------------------- /flextensor/configs/block_circulant_matrix_config.py: -------------------------------------------------------------------------------- 1 | block_circulant_matrix_shapes = [] 2 | 3 | for shape in [(1024, 256), (1024, 512), (1024, 40)]: 4 | for factor in [8, 16]: 5 | block_circulant_matrix_shapes.append((*shape, factor)) 6 | -------------------------------------------------------------------------------- /flextensor/configs/conv1d_config.py: -------------------------------------------------------------------------------- 1 | conv1d_shapes = [ 2 | (1, 192, 56 * 56, 128, 192, 1 * 1, 1, 1, 0, 1, 1), 3 | (1, 128, 56 * 56, 256, 128, 3 * 3, 1, 1, 1, 1, 1), 4 | (1, 512, 28 * 28, 256, 512, 1 * 1, 1, 1, 0, 1, 1), 5 | (1, 256, 28 * 28, 512, 256, 3 * 3, 1, 1, 1, 1, 1), 6 | (1, 1024, 196, 512, 1024, 1 * 1, 1, 1, 0, 1, 1), 7 | (1, 512, 196, 1024, 512, 3 * 3, 1, 1, 1, 1, 1), 8 | (1, 1024, 49, 1024, 1024, 3 * 3, 1, 1, 1, 1, 1), 9 | ] -------------------------------------------------------------------------------- /flextensor/configs/conv3d_config.py: -------------------------------------------------------------------------------- 1 | conv3d_shapes = [ 2 | # yolo 3 | # (1, 3, 16, 112, 112, 64, 3, 7, 1, 2, 3, 1, 1), 4 | # (1, 64, 16, 28, 28, 192, 64, 3, 1, 1, 1, 1, 1), 5 | # (1, 192, 4, 28, 28, 128, 192, 1, 1, 1, 0, 1, 1), 6 | # (1, 128, 4, 28, 28, 256, 128, 3, 1, 1, 1, 1, 1), 7 | # (1, 512, 4, 14, 14, 256, 512, 1, 1, 1, 0, 1, 1), 8 | # (1, 256, 4, 14, 14, 512, 256, 3, 1, 1, 1, 1, 1), 9 | # (1, 1024, 4, 7, 7, 512, 1024, 1, 1, 1, 0, 1, 1), 10 | # (1, 512, 4, 7, 7, 1024, 512, 3, 1, 1, 1, 1, 1), 11 | 12 | # resnet-18 13 | # N, C, D, H, W, _, K, _, kernel_size, stride=1, padding=0,
dilation=1, groups=1 14 | # skip the first layer 15 | # ( 1, 3, 16, 112, 112, 64, 7, (1, 2, 2), 3, 1, 1), 16 | # conv2_x 17 | ( 1, 64, 8, 28, 28, 64, 64, 3, 1, 1, 1, 1, 1), 18 | # ( 1, 64, 8, 28, 28, 64, 3, 1, 1, 1, 1), 19 | # conv3_x 20 | ( 1, 64, 8, 28, 28, 128, 64, 3, 1, 2, 1, 1, 1), 21 | ( 1, 128, 4, 14, 14, 128, 128, 3, 1, 1, 1, 1, 1), 22 | ( 1, 64, 8, 28, 28, 128, 64, 1, 1, 2, 1, 1, 1), 23 | 24 | # ( 1, 128, 4, 14, 14, 128, 3, 1, 1, 1, 1), 25 | # ( 1, 128, 4, 14, 14, 128, 3, 1, 1, 1, 1), 26 | # conv4_x 27 | ( 1, 128, 4, 14, 14, 256, 128, 3, 1, 2, 1, 1, 1), 28 | ( 1, 256, 2, 7, 7, 256, 256, 3, 1, 1, 1, 1, 1), 29 | ( 1, 128, 4, 14, 14, 256, 128, 1, 1, 2, 1, 1, 1), 30 | 31 | # ( 1, 256, 2, 7, 7, 256, 3, 1, 1, 1, 1), 32 | # ( 1, 256, 2, 7, 7, 256, 3, 1, 1, 1, 1), 33 | # conv5_x 34 | ( 1, 256, 2, 7, 7, 512, 256, 3, 1, 2, 1, 1, 1), 35 | ( 1, 512, 1, 3, 3, 512, 512, 3, 1, 1, 1, 1, 1), 36 | ( 1, 256, 2, 7, 7, 512, 256, 1, 1, 2, 1, 1, 1), 37 | 38 | # ( 1, 512, 1, 3, 3, 512, 3, 1, 1, 1, 1), 39 | # ( 1, 512, 1, 3, 3, 512, 3, 1, 1, 1, 1), 40 | ] -------------------------------------------------------------------------------- /flextensor/configs/conv_transpose2d_config.py: -------------------------------------------------------------------------------- 1 | # N, C, H, W, K, kernel_size, stride=1, padding=0, output_padding=0, dilation=1, groups=1 2 | conv_transpose2d_shapes = [ 3 | # U-Net(https://arxiv.org/abs/1505.04597) with input size: 572 * 572 4 | (1, 1024, 28, 28, 512, 2, 2, 0, 0, 1, 1), # 1*1024*28*28 -> 1*512*56*56 5 | (1, 512, 52, 52, 256, 2, 2, 0, 0, 1, 1), # 1*512*52*52 -> 1*256*104*104 6 | (1, 256, 100, 100, 128, 2, 2, 0, 0, 1, 1), # 1*256*100*100 -> 1*128*200*200 7 | (1, 128, 196, 196, 64, 2, 2, 0, 0, 1, 1), # 1*128*196*196 -> 1*64*392*392 8 | 9 | # ShiftNet(https://arxiv.org/abs/1801.09392) with input size: 256*256 10 | (1, 512, 1, 1, 512, 4, 2, 1, 0, 1, 1), # 1*512*1*1 -> 1*512*2*2 11 | (1, 1024, 2, 2, 512, 4, 2, 1, 0, 1, 1), # 1*1024*2*2 -> 1*512*4*4 12 | (1, 1024, 4, 4, 512, 4, 2, 1, 0, 1, 1), # 1*1024*4*4 -> 1*512*8*8 13 | (1, 1024, 8, 8, 512, 4, 2, 1, 0, 1, 1), # 1*1024*8*8 -> 1*512*16*16 14 | (1, 1024, 16, 16, 256, 4, 2, 1, 0, 1, 1), # 1*1024*16*16 -> 1*256*32*32 15 | (1, 768, 32, 32, 128, 4, 2, 1, 0, 1, 1), # 1*768*32*32 -> 1*128*64*64 16 | (1, 256, 64, 64, 64, 4, 2, 1, 0, 1, 1), # 1*256*64*64 -> 1*64*128*128 17 | (1, 128, 128, 128, 3, 4, 2, 1, 0, 1, 1), # 1*128*128*128 -> 1*3*256*256 18 | ] -------------------------------------------------------------------------------- /flextensor/configs/depthwise_config.py: -------------------------------------------------------------------------------- 1 | depthwise_shapes = [ 2 | (1, 32, 112, 112, 1, 3, 3, 1, 1, 1), 3 | (1, 16, 112, 112, 6, 3, 3, 2, 1, 1), 4 | (1, 24, 56, 56, 6, 3, 3, 2, 1, 1), 5 | (1, 32, 28, 28, 6, 3, 3, 2, 1, 1), 6 | (1, 64, 14, 14, 6, 3, 3, 1, 1, 1), 7 | (1, 96, 14, 14, 6, 3, 3, 2, 1, 1), 8 | (1, 160, 7, 7, 6, 3, 3, 1, 1, 1), 9 | ] 10 | 11 | 12 | depthwise_shapes_b8 = [ 13 | (8, 32, 112, 112, 1, 3, 3, 1, 1, 1), 14 | (8, 16, 112, 112, 6, 3, 3, 2, 1, 1), 15 | (8, 24, 56, 56, 6, 3, 3, 2, 1, 1), 16 | (8, 32, 28, 28, 6, 3, 3, 2, 1, 1), 17 | (8, 64, 14, 14, 6, 3, 3, 1, 1, 1), 18 | (8, 96, 14, 14, 6, 3, 3, 2, 1, 1), 19 | (8, 160, 7, 7, 6, 3, 3, 1, 1, 1), 20 | ] 21 | 22 | 23 | depthwise_shapes_b16 = [ 24 | (16, 32, 112, 112, 1, 3, 3, 1, 1, 1), 25 | (16, 16, 112, 112, 6, 3, 3, 2, 1, 1), 26 | (16, 24, 56, 56, 6, 3, 3, 2, 1, 1), 27 | (16, 32, 28, 28, 6, 3, 3, 2, 1, 1), 28 | (16, 64, 14, 14, 6, 3, 3, 1, 1, 1), 29 | (16, 96, 14, 
14, 6, 3, 3, 2, 1, 1), 30 | (16, 160, 7, 7, 6, 3, 3, 1, 1, 1), 31 | ] -------------------------------------------------------------------------------- /flextensor/configs/dilation_config.py: -------------------------------------------------------------------------------- 1 | dilation_shapes_test = [ 2 | (1, 3, 448, 448, 64, 7, 7, 2, 3, 2, 1), 3 | (1, 64, 112, 112, 192, 3, 3, 1, 1, 2, 1), 4 | (1, 192, 56, 56, 128, 1, 1, 1, 0, 2, 1), 5 | (1, 128, 56, 56, 256, 3, 3, 1, 1, 2, 1), 6 | ] 7 | 8 | dilation_shapes = [ 9 | # yolo 10 | (1, 256, 56, 56, 256, 1, 1, 1, 0, 2, 1), # conv5 4 11 | (1, 256, 56, 56, 512, 3, 3, 1, 1, 2, 1), # conv6 5 12 | (1, 512, 28, 28, 256, 1, 1, 1, 0, 2, 1), # conv7 6 13 | (1, 256, 28, 28, 512, 3, 3, 1, 1, 2, 1), # conv8 7 14 | 15 | (1, 512, 28, 28, 512, 1, 1, 1, 0, 2, 1), # conv15 8 16 | (1, 512, 28, 28, 1024, 3, 3, 1, 1, 2, 1), # conv16 9 17 | (1, 1024, 14, 14, 512, 1, 1, 1, 0, 2, 1), # conv17 10 18 | (1, 512, 14, 14, 1024, 3, 3, 1, 1, 2, 1), # conv18 11 19 | 20 | (1, 1024, 14, 14, 1024, 3, 3, 1, 1, 2, 1), # conv21 12 21 | (1, 1024, 14, 14, 1024, 3, 3, 2, 1, 2, 1), # conv22 13 22 | (1, 1024, 7, 7, 1024, 3, 3, 1, 1, 2, 1), # conv23 14 23 | ] -------------------------------------------------------------------------------- /flextensor/configs/gated_pixelcnn_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | batch_size, input_height, input_width, in_channels, out_channels, kernel_height, kernel_width, ClassVector=None, bias=None, dilation=1, stride=1, padding=0 3 | """ 4 | gated_pixelcnn_shape = [ 5 | (1, 256, 256, 3, 256, 3, None, None, 1, 1, 0) 6 | ] -------------------------------------------------------------------------------- /flextensor/configs/gemm_config.py: -------------------------------------------------------------------------------- 1 | old_gemm_shapes = [ 2 | (32, 32, 32), 3 | (64, 64, 64), 4 | (128, 128, 128), 5 | (256, 256, 256), 6 | (512, 512, 512), 7 | (1024, 1024, 1024), 8 | (2048, 2048, 2048) 9 | ] 10 | 11 | gemm_shapes = [] 12 | for i in range(5, 13): 13 | for j in range(5, 13): 14 | for k in range(5, 13): 15 | gemm_shapes.append([2**i, 2**k, 2**j]) 16 | 17 | 18 | test_gemm_shapes = [ 19 | # batch 20 | # height 21 | # width 22 | # length 23 | 24 | # open 25 | (1, 1024, 1024, 1024), 26 | (2, 512, 512, 512), 27 | (3, 1024, 32, 1024), 28 | # confidential 29 | (16, 4096, 128, 1024), 30 | (32, 28, 1024, 28), 31 | ] -------------------------------------------------------------------------------- /flextensor/configs/gemv_config.py: -------------------------------------------------------------------------------- 1 | gemv_shapes = [ 2 | (128, 128, 128), 3 | (256, 256, 256), 4 | (512, 512, 512), 5 | (1024, 1024, 1024), 6 | (2048, 2048, 2048), 7 | (4096, 4096, 4096), 8 | ] -------------------------------------------------------------------------------- /flextensor/configs/grouped_config.py: -------------------------------------------------------------------------------- 1 | grouped_shapes = [ 2 | (1, 1024, 14, 14, 512, 1, 1, 1, 0, 1, 4), 3 | (1, 1024, 14, 14, 512, 1, 1, 1, 0, 1, 8), 4 | (1, 1024, 14, 14, 512, 1, 1, 1, 0, 1, 16), 5 | (1, 1024, 14, 14, 512, 1, 1, 1, 0, 1, 32), 6 | ] 7 | 8 | grouped_shapes_ = [ 9 | (1, 64, 112, 112, 192, 3, 3, 1, 1, 1, 16), # conv2 1 10 | (1, 192, 56, 56, 128, 1, 1, 1, 0, 1, 16), # conv3 2 11 | (1, 128, 56, 56, 256, 3, 3, 1, 1, 1, 16), # conv4 3 12 | (1, 256, 56, 56, 256, 1, 1, 1, 0, 1, 16), # conv5 4 13 | (1, 256, 56, 56, 512, 3, 3, 1, 1, 1, 16), # conv6 5 14 | (1, 512, 28, 28, 
256, 1, 1, 1, 0, 1, 16), # conv7 6 15 | (1, 256, 28, 28, 512, 3, 3, 1, 1, 1, 16), # conv8 7 16 | 17 | (1, 512, 28, 28, 512, 1, 1, 1, 0, 1, 16), # conv15 8 18 | (1, 512, 28, 28, 1024, 3, 3, 1, 1, 1, 16), # conv16 9 19 | (1, 1024, 14, 14, 512, 1, 1, 1, 0, 1, 16), # conv17 10 20 | (1, 512, 14, 14, 1024, 3, 3, 1, 1, 1, 16), # conv18 11 21 | 22 | (1, 1024, 14, 14, 1024, 3, 3, 1, 1, 1, 16), # conv21 12 23 | (1, 1024, 14, 14, 1024, 3, 3, 2, 1, 1, 16), # conv22 13 24 | (1, 1024, 7, 7, 1024, 3, 3, 1, 1, 1, 16), # conv23 14 25 | ] -------------------------------------------------------------------------------- /flextensor/configs/maxunpooling1d_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | batch, channels, in_lengths, kernel_size, stride, padding 3 | """ 4 | maxunpooling1d_shape = [ 5 | (1, 128, 114, 2, 2, 0), 6 | (1, 256, 56, 2, 2, 0), 7 | (1, 512, 28, 2, 2, 0), 8 | (1, 512, 14, 2, 2, 0) 9 | ] -------------------------------------------------------------------------------- /flextensor/configs/maxunpooling2d_config.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | batch, channels, height, width, kernel_size, stride, padding 4 | """ 5 | maxunpooling2d_shape = [ 6 | # DeconvNet(https://arxiv.org/abs/1505.04366) --- based on VGG-16 7 | (1, 64, 112, 112, 2, 2, 0), # 1*64*112*112 -> 1*64*224*224 8 | (1, 128, 56, 56, 2, 2, 0), # 1*128*56*56 -> 1*128*112*112 9 | (1, 256, 28, 28, 2, 2, 0), # 1*256*28*28 -> 1*256*56*56 10 | (1, 512, 14, 14, 2, 2, 0), # 1*512*14*14 -> 1*512*28*28 11 | (1, 512, 7, 7, 2, 2, 0), # 1*512*7*7 -> 1*512*14*14 12 | ] -------------------------------------------------------------------------------- /flextensor/configs/mttkrp_config.py: -------------------------------------------------------------------------------- 1 | mttkrp_shapes = [ 2 | (128, 256, 512, 64), 3 | (128, 256, 256, 128), 4 | (256, 128, 256, 128), 5 | (256, 128, 128, 256), 6 | (512, 64, 128, 256), 7 | (512, 64, 64, 512), 8 | (1024, 32, 32, 1024) 9 | ] -------------------------------------------------------------------------------- /flextensor/configs/shift_conv2d_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | (N, H, W, C, kernel_size, dilation) 3 | """ 4 | shift_conv2d_shape = [ 5 | # ShiftNet(https://arxiv.org/abs/1801.09392) with input size: 256*256 6 | # (1, 128, 128, 64, 3, 1), 7 | (1, 128, 128, 64, 3, 1), 8 | (1, 64, 64, 128, 5, 1), 9 | (1, 32, 32, 256, 3, 1), 10 | (1, 16, 16, 512, 3, 1) 11 | ] -------------------------------------------------------------------------------- /flextensor/examples/__init__.py: -------------------------------------------------------------------------------- 1 | from .single_operation import FUNC_TABLE 2 | -------------------------------------------------------------------------------- /flextensor/examples/autotvm_opt_conv1_cpu.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import logging 3 | import sys 4 | import time 5 | from tvm import autotvm 6 | from flextensor.scheduler import parallel_evaluate 7 | 8 | 9 | @autotvm.template 10 | def conv2d_channel_batch(B, N, M, C, K, L, O, stride=1, padding=0, dtype="float32"): 11 | A = tvm.te.placeholder((B, N, M, C), dtype=dtype, name="A") 12 | W = tvm.te.placeholder((K, L, C, O), dtype=dtype, name="W") 13 | N_out = max(0, (N + padding * 2 - K) // stride) + 1 14 | M_out = max(0, (M + padding * 2 - L) // stride) + 1 15 | Apad = 
tvm.te.compute((B, N + 2 * padding, M + 2 * padding, C), 16 | lambda b, i, j, k: tvm.te.if_then_else( 17 | tvm.te.all(i >= padding, j >= padding, i < N + padding, j < M + padding), 18 | A[b, i - padding, j - padding, k], 0.0), name="Apad") 19 | rx, ry = tvm.te.reduce_axis((0, K), name="rx"), tvm.te.reduce_axis((0, L), name="ry") 20 | rc = tvm.te.reduce_axis((0, C), name="rc") 21 | Output = tvm.te.compute((B, N_out, M_out, O), 22 | lambda b, i, j, k: tvm.te.sum(Apad[b, i * stride + rx, j * stride + ry, rc] * W[rx, ry, rc, k], 23 | axis=[rx, ry, rc]), 24 | name="Output") 25 | 26 | s = tvm.te.create_schedule(Output.op) 27 | s[Apad].compute_inline() 28 | CL = s.cache_write(Output, "local") 29 | 30 | n, h, w, c = s[Output].op.axis 31 | out = s[Output].fuse(h, w) 32 | cfg = autotvm.get_config() 33 | cfg.define_split("split_n", n, num_outputs=2) 34 | cfg.define_split("split_c", c, num_outputs=2) 35 | no, ni = cfg["split_n"].apply(s, Output, n) 36 | co, ci = cfg["split_c"].apply(s, Output, c) 37 | s[Output].reorder(no, out, co, ni, ci) 38 | s[Output].parallel(out) 39 | 40 | # schedule CL 41 | s[CL].compute_at(s[Output], co) 42 | ni, hi, wi, ci = s[CL].op.axis 43 | xi, yi, ki = s[CL].op.reduce_axis 44 | cfg.define_split("split_k", ki, num_outputs=2) 45 | ko, ki = cfg["split_k"].apply(s, CL, ki) 46 | s[CL].reorder(ko, xi, yi, ni, ki, ci) 47 | s[CL].unroll(ki) 48 | s[CL].vectorize(ci) 49 | 50 | return s, [A, W, Output] 51 | 52 | 53 | args = (1, 14, 14, 256, 3, 3, 512, 1, 1) 54 | task = autotvm.task.create(conv2d_channel_batch, args=args, target="llvm") 55 | 56 | logging.getLogger("autotvm").setLevel(logging.DEBUG) 57 | logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) 58 | 59 | measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=10)) 60 | 61 | # begin tuning 62 | tuner = autotvm.tuner.XGBTuner(task) 63 | # tuner = autotvm.tuner.RandomTuner(task) 64 | # tuner = autotvm.tuner.GATuner(task) 65 | # tuner = autotvm.tuner.GridSearchTuner(task) 66 | n_trial = len(task.config_space) 67 | print("trials=", n_trial) 68 | beg = time.time() 69 | tuner.tune(n_trial=n_trial, measure_option=measure_option, callbacks=[autotvm.callback.log_to_file("conv2d.log")]) 70 | end = time.time() 71 | # history best 72 | with autotvm.apply_history_best("conv2d.log"): 73 | with tvm.target.create("llvm"): 74 | s, bufs = conv2d_channel_batch(*args) 75 | func = tvm.build(s, bufs) 76 | 77 | # time evaluate 78 | time_cost = parallel_evaluate(s, bufs, "llvm", 2) 79 | print("time cost is: ", time_cost, "ms, use ",(end - beg), "s") 80 | -------------------------------------------------------------------------------- /flextensor/examples/opt_blur2d_cpu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import numpy as np 4 | import argparse 5 | from flextensor.examples import FUNC_TABLE 6 | from flextensor.test import test_graph_schedule_cpu_general_dx 7 | from flextensor.train import Entity, train_op_schedule_cpu_general_dx 8 | 9 | 10 | def run(M, N, k, model_path, epoch=5, sample_size=16, number=100, test=False): 11 | entities = [] 12 | func = FUNC_TABLE["gaussian_blur2d"].func 13 | args = (M, N, k) 14 | entities.append(Entity("gaussian_blur2d", args)) 15 | model_path = os.path.abspath(model_path) 16 | if not test: 17 | beg = time.time() 18 | train_op_schedule_cpu_general_dx(entities, epoch, sample_size, model_path) 19 | end = time.time() 20 | print("{}({}):".format("gaussian_blur2d", args)) 21 | 
print("train done! use {}ms".format((end - beg) * 1e3)) 22 | test_graph_schedule_cpu_general_dx(func, args, model_path, number=number) 23 | 24 | 25 | def numpy_baseline(M, N, k, number=10): 26 | A = np.random.random([M, N]) 27 | B = np.zeros([M, N]) 28 | 29 | def blur(A, k, B): 30 | Apad = np.vstack([np.zeros([k//2, A.shape[1] + k // 2]), np.hstack([np.zeros([A.shape[0], k // 2]), A])]) 31 | for i in range(k): 32 | for j in range(k): 33 | np.add(B, np.vstack([np.hstack([Apad[:A.shape[0], j:], np.zeros([A.shape[0], j])])[i:, :A.shape[1]], np.zeros([i, A.shape[1]])]), B) 34 | np.divide(B, k * k, B) 35 | 36 | beg = time.time() 37 | for i in range(number): 38 | blur(A, k, B) 39 | end = time.time() 40 | cost = (end - beg) * 1e3 41 | return cost 42 | 43 | 44 | if __name__ == "__main__": 45 | # parser = argparse.ArgumentParser() 46 | # parser.add_argument("-t", "--train", help="train the model", action="store_true") 47 | # parser.add_argument("-p", "--pytorch", help="run pytorch baseline", action="store_true") 48 | # parser.add_argument("-a", "--auto_schedule", help="run auto-scheduler", action="store_true") 49 | # parser.add_argument("-n", "--number", help="number of tests", type=int, default=100) 50 | # parser.add_argument("-f", "--model_file_path", type=str, default="../logs/test_model.pkl") 51 | # parser.add_argument("--params", help="N,H,W,L,C,k,K,stride,padding", type=str, default="1,14,14,14,512,3,512,1,1") 52 | # parser.add_argument("--epoch", type=int, default=5) 53 | # parser.add_argument("--sample", type=int, default=16) 54 | # args = parser.parse_args() 55 | # test = not args.train 56 | # use_torch = args.pytorch 57 | # use_auto = args.auto_schedule 58 | # try: 59 | # params = [int(x) for x in args.params.split(",")] 60 | # batch_size, height, width, depth, channel, kernel_size, output_channel, stride, padding = params 61 | # if use_torch: 62 | # pytorch_baseliine(batch_size, height, width, depth, channel, kernel_size, output_channel, stride, padding, args.number) 63 | # if use_auto: 64 | # run(batch_size, height, width, depth, channel, kernel_size, output_channel, stride, padding, 65 | # args.model_file_path, args.epoch, args.sample, args.number, test) 66 | # except Exception as e: 67 | # raise ValueError("Bad parameters, please refer to usage") 68 | arg_lst = [ 69 | (1024, 1024, 7), 70 | (1024, 1024, 3), 71 | (2048, 2048, 3), 72 | ] 73 | 74 | names = [ 75 | "1024_7", 76 | "1024_3", 77 | "2048_3", 78 | ] 79 | 80 | for i in range(len(arg_lst)): 81 | # model_path = "../models/opt_blur2d_" + names[i] + "_cpu_process.pkl" 82 | # entities = [] 83 | # func = FUNC_TABLE["gaussian_blur2d"].func 84 | # args = arg_lst[i] 85 | # entities.append(Entity("gaussian_blur2d", args)) 86 | # model_path = os.path.abspath(model_path) 87 | # train_op_schedule_cpu_general_dx(entities, 20, 25, model_path, logfile="process_blur_" + names[i] + "_cpu.txt", device="cuda:1") 88 | M, N, k = arg_lst[i] 89 | print(arg_lst[i], numpy_baseline(M, N, k)) 90 | -------------------------------------------------------------------------------- /flextensor/examples/opt_conv_gpu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import argparse 5 | 6 | 7 | def run(N, H, W, C, kernel_size, K, stride, padding, model_path, epoch=5, sample=16, number=100, test=False): 8 | return -1 9 | 10 | 11 | def pytorch_baseliine(N, H, W, C, kernel_size, K, stride, padding, number=100, dev=0): 12 | A = torch.rand([N, C, H, W]).cuda("cuda:" + str(dev)) 13 | W 
= torch.rand([K, C, kernel_size, kernel_size]).cuda("cuda:" + str(dev)) 14 | 15 | start = torch.cuda.Event(enable_timing=True) 16 | end = torch.cuda.Event(enable_timing=True) 17 | 18 | start.record() 19 | for i in range(number): 20 | torch.nn.functional.conv2d(A, W, stride=stride, padding=padding) 21 | end.record() 22 | 23 | # Waits for everything to finish running 24 | torch.cuda.synchronize() 25 | 26 | return start.elapsed_time(end) / number 27 | 28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("-t", "--train", help="train the model", action="store_true") 32 | parser.add_argument("-p", "--pytorch", help="run pytorch baseline", action="store_true") 33 | parser.add_argument("-a", "--flextensor", help="run auto-scheduler", action="store_true") 34 | parser.add_argument("-n", "--number", help="number of tests", type=int, default=100) 35 | parser.add_argument("-f", "--model_file_path", type=str, default="../logs/test_model.pkl") 36 | parser.add_argument("--params", help="N,H,W,C,k,K,stride,padding", type=str, default="1,14,14,512,3,512,1,1") 37 | parser.add_argument("--epoch", type=int, default=5) 38 | parser.add_argument("--sample", type=int, default=16) 39 | parser.add_argument("--device", type=int, default=0) 40 | args = parser.parse_args() 41 | test = not args.train 42 | use_torch = args.pytorch 43 | use_auto = args.flextensor 44 | try: 45 | params = [int(x) for x in args.params.split(",")] 46 | batch_size, height, width, channel, kernel_size, output_channel, stride, padding = params 47 | if use_torch: 48 | cost = pytorch_baseliine(batch_size, height, width, channel, kernel_size, output_channel, stride, padding, args.number, args.device) 49 | print("PyTorch baseline: {}ms".format(cost)) 50 | if use_auto: 51 | run(batch_size, height, width, channel, kernel_size, output_channel, stride, padding, 52 | args.model_file_path, args.epoch, args.sample, args.number, test) 53 | except Exception as e: 54 | raise ValueError("Bad parameters, please refer to usage") 55 | 56 | # arg_lst = [ 57 | # (1, 7, 7, 1024, 3, 3, 1024, 1, 1), 58 | # # (8, 7, 7, 1024, 3, 3, 1024, 1, 1), 59 | # # (64, 7, 7, 1024, 3, 3, 1024, 1, 1), 60 | # # (256, 7, 7, 1024, 3, 3, 1024, 1, 1), 61 | # (1, 14, 14, 1024, 1, 1, 512, 1, 0), 62 | # (1, 28, 28, 256, 3, 3, 512, 1, 1), 63 | # (1, 28, 28, 512, 1, 1, 256, 1, 0), 64 | # (1, 56, 56, 128, 3, 3, 256, 1, 1), 65 | # (1, 56, 56, 192, 1, 1, 128, 1, 0), 66 | # (1, 112, 112, 64, 3, 3, 192, 1, 1), 67 | # (1, 448, 448, 3, 7, 7, 64, 2, 3) 68 | # ] 69 | # names = [ 70 | # "yolo24_b1", 71 | # # "yolo24_b8", 72 | # # "yolo24_b64", 73 | # # "yolo24_b256", 74 | # "yolo19_b1", 75 | # "yolo10_b1", 76 | # "yolo7_b1", 77 | # "yolo4_b1", 78 | # "yolo3_b1", 79 | # "yolo2_b1", 80 | # "yolo1_b1" 81 | # ] 82 | # for i in range(len(arg_lst)): 83 | # model_path = "opt_conv2d_nchw_" + names[i] + "_gpu.pkl" 84 | # entities = [] 85 | # args = arg_lst[i] 86 | # entities.append(Entity("conv2d_nchw", args)) 87 | # model_path = os.path.abspath(model_path) 88 | # train_op_schedule_gpu_general_dx(entities, 40, 25, model_path, logfile="process_conv2d_nchw_" + names[i] + "_gpu.txt") 89 | 90 | -------------------------------------------------------------------------------- /flextensor/examples/opt_gemm_cpu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import argparse 5 | from flextensor.examples import FUNC_TABLE 6 | from flextensor.test import test_graph_schedule_cpu_general_dx 7 | from flextensor.train 
import Entity, train_op_schedule_cpu_general_dx 8 | 9 | 10 | def run(batch_size, M, N, L, model_path, epoch=5, sample=16, number=100, test=False): 11 | entities = [] 12 | func = FUNC_TABLE["matmul_batch"].func 13 | args = (batch_size, M, N, L) 14 | entities.append(Entity(func, args)) 15 | model_path = os.path.abspath(model_path) 16 | if not test: 17 | beg = time.time() 18 | train_op_schedule_cpu_general_dx(entities, epoch, sample, model_path) 19 | end = time.time() 20 | print("{}({}):".format("matmul_batch", args)) 21 | print("train done! use {}ms".format((end - beg) * 1e3)) 22 | test_graph_schedule_cpu_general_dx(func, args, model_path, number=number) 23 | 24 | 25 | def pytorch_baseliine(batch_size, M, N, L, number=100): 26 | A = torch.rand((batch_size, M, N)) 27 | B = torch.rand((N, L)) 28 | beg = time.time() 29 | for i in range(number): 30 | C = A.matmul(B) 31 | end = time.time() 32 | cost = (end - beg) / number * 1e3 33 | print("pytorch gemm use {}ms".format(cost)) 34 | return cost 35 | 36 | 37 | if __name__ == "__main__": 38 | # parser = argparse.ArgumentParser() 39 | # parser.add_argument("-t", "--train", help="train the model", action="store_true") 40 | # parser.add_argument("-p", "--pytorch", help="run pytorch baseline", action="store_true") 41 | # parser.add_argument("-a", "--auto_schedule", help="run auto-scheduler", action="store_true") 42 | # parser.add_argument("-n", "--number", help="number of tests", type=int, default=100) 43 | # parser.add_argument("-f", "--model_file_path", type=str, default="../logs/test_model.pkl") 44 | # parser.add_argument("--params", help="B,M,N,L", type=str, default="1,1024,1024,1024") 45 | # parser.add_argument("--epoch", type=int, default=5) 46 | # parser.add_argument("--sample", type=int, default=16) 47 | # args = parser.parse_args() 48 | # test = not args.train 49 | # use_torch = args.pytorch 50 | # use_auto = args.auto_schedule 51 | # try: 52 | # params = [int(x) for x in args.params.split(",")] 53 | # batch_size, M, N, L = params 54 | # if use_torch: 55 | # pytorch_baseliine(batch_size, M, N, L, args.number) 56 | # if use_auto: 57 | # run(batch_size, M, N, L, args.model_file_path, args.epoch, args.sample, args.number, test) 58 | # except Exception as e: 59 | # raise ValueError("Bad parameters, please refer to usage") 60 | 61 | # gemm 62 | arg_lst = [ 63 | (1, 128, 128, 128), 64 | (1, 256, 256, 256), 65 | (1, 512, 512, 512), 66 | (1, 1024, 1024, 1024), 67 | (1, 2048, 2048, 2048), 68 | (1, 4096, 4096, 4096), 69 | (1, 1024, 32, 1024), 70 | (1, 32, 1024, 32), 71 | ] 72 | 73 | names = [ 74 | "128_128_128_b1", 75 | "256_256_256_b1", 76 | "512_512_512_b1", 77 | "1024_1024_1024_b1", 78 | "2048_2048_2048_b1", 79 | "4096_4096_4096_b1", 80 | "1024_32_1024_b1", 81 | "32_1024_32_b1", 82 | ] 83 | for i in range(len(arg_lst)): 84 | model_path = "opt_gemm_" + names[i] + "_cpu.pkl" 85 | entities = [] 86 | args = arg_lst[i] 87 | entities.append(Entity("matmul_batch", args)) 88 | model_path = os.path.abspath(model_path) 89 | train_op_schedule_cpu_general_dx(entities, 20, 50, model_path, logfile="process_gemm_" + names[i] + "_cpu.txt", device="cuda:0") 90 | 91 | -------------------------------------------------------------------------------- /flextensor/examples/opt_gemm_gpu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import argparse 5 | from flextensor.test import test_graph_schedule_gpu_general_dx 6 | from flextensor.train import Entity, train_op_schedule_gpu_general_dx 7 
| 8 | 9 | def run(batch_size, M, N, L, model_path, epoch=5, sample=16, number=100, test=False): 10 | entities = [] 11 | args = (batch_size, M, N, L) 12 | entities.append(Entity("matmul_batch", args)) 13 | model_path = os.path.abspath(model_path) 14 | if not test: 15 | beg = time.time() 16 | train_op_schedule_gpu_general_dx(entities, epoch, sample, model_path) 17 | end = time.time() 18 | print("{}({}):".format("matmul_batch", args)) 19 | print("train done! use {}ms".format((end - beg) * 1e3)) 20 | test_graph_schedule_gpu_general_dx(entities, model_path, sampling=True, number=number) 21 | 22 | 23 | def pytorch_baseliine(batch_size, M, N, L, number=100): 24 | A = torch.rand((batch_size, M, N)).cuda() 25 | B = torch.rand((N, L)).cuda() 26 | beg = time.time() 27 | for i in range(number): 28 | C = A.matmul(B) 29 | torch.cuda.synchronize(); end = time.time() 30 | cost = (end - beg) / number * 1e3 31 | return cost 32 | 33 | 34 | if __name__ == "__main__": 35 | # parser = argparse.ArgumentParser() 36 | # parser.add_argument("-t", "--train", help="train the model", action="store_true") 37 | # parser.add_argument("-p", "--pytorch", help="run pytorch baseline", action="store_true") 38 | # parser.add_argument("-a", "--auto_schedule", help="run auto-scheduler", action="store_true") 39 | # parser.add_argument("-n", "--number", help="number of tests", type=int, default=100) 40 | # parser.add_argument("-f", "--model_file_path", type=str, default="../logs/test_model.pkl") 41 | # parser.add_argument("--params", help="B,M,N,L", type=str, default="1,1024,1024,1024") 42 | # parser.add_argument("--epoch", type=int, default=5) 43 | # parser.add_argument("--sample", type=int, default=16) 44 | # args = parser.parse_args() 45 | # test = not args.train 46 | # use_torch = args.pytorch 47 | # use_auto = args.auto_schedule 48 | # try: 49 | # params = [int(x) for x in args.params.split(",")] 50 | # batch_size, M, N, L = params 51 | # if use_torch: 52 | # cost = pytorch_baseliine(batch_size, M, N, L, args.number) 53 | # print("pytorch gemm use {}ms".format(cost)) 54 | # if use_auto: 55 | # run(batch_size, M, N, L, args.model_file_path, args.epoch, args.sample, args.number, test) 56 | # except Exception as e: 57 | # raise ValueError("Bad parameters, please refer to usage") 58 | 59 | # gemm 60 | arg_lst = [ 61 | (1, 32, 32, 32), 62 | (1, 64, 64, 64), 63 | (1, 128, 128, 128), 64 | (1, 256, 256, 256), 65 | (1, 512, 512, 512), 66 | (1, 1024, 1024, 1024), 67 | (1, 2048, 2048, 2048), 68 | (1, 4096, 4096, 4096), 69 | (1, 1024, 32, 1024), 70 | (1, 32, 1024, 32), 71 | ] 72 | 73 | names = [ 74 | "32_32_32_b1", 75 | "64_64_64_b1", 76 | "128_128_128_b1", 77 | "256_256_256_b1", 78 | "512_512_512_b1", 79 | "1024_1024_1024_b1", 80 | "2048_2048_2048_b1", 81 | "4096_4096_4096_b1", 82 | "1024_32_1024_b1", 83 | "32_1024_32_b1", 84 | ] 85 | for i in range(len(arg_lst)): 86 | model_path = "opt_gemm_" + names[i] + "_gpu.pkl" 87 | entities = [] 88 | args = arg_lst[i] 89 | entities.append(Entity("matmul_batch", args)) 90 | model_path = os.path.abspath(model_path) 91 | train_op_schedule_gpu_general_dx(entities, 20, 50, model_path, logfile="process_gemm_" + names[i] + "_gpu.txt", device="cuda:1") 92 | 93 | 94 | -------------------------------------------------------------------------------- /flextensor/examples/opt_outer_cpu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import numpy as np 4 | import argparse 5 | from flextensor.examples import FUNC_TABLE 6 | from flextensor.test import 
test_graph_schedule_cpu_general_dx 7 | from flextensor.train import Entity, train_op_schedule_cpu_general_dx 8 | 9 | 10 | def run(M, N, model_path, epoch=5, sample_size=16, number=100, test=False): 11 | entities = [] 12 | func = FUNC_TABLE["outer_product"].func 13 | args = (M, N) 14 | entities.append(Entity("outer_product", args)) 15 | model_path = os.path.abspath(model_path) 16 | if not test: 17 | beg = time.time() 18 | train_op_schedule_cpu_general_dx(entities, epoch, sample_size, model_path) 19 | end = time.time() 20 | print("{}({}):".format("outer_product", args)) 21 | print("train done! use {}ms".format((end - beg) * 1e3)) 22 | test_graph_schedule_cpu_general_dx(func, args, model_path, number=number) 23 | 24 | 25 | def numpy_baseline(M, N, number=100): 26 | A = np.random.random(M) 27 | B = np.random.random(N) 28 | C = np.zeros([M, N]) 29 | beg = time.time() 30 | for i in range(number): 31 | np.outer(A, B, C) 32 | end = time.time() 33 | cost = (end - beg) * 1e3 / number 34 | return cost 35 | 36 | 37 | if __name__ == "__main__": 38 | # parser = argparse.ArgumentParser() 39 | # parser.add_argument("-t", "--train", help="train the model", action="store_true") 40 | # parser.add_argument("-p", "--pytorch", help="run pytorch baseline", action="store_true") 41 | # parser.add_argument("-a", "--auto_schedule", help="run auto-scheduler", action="store_true") 42 | # parser.add_argument("-n", "--number", help="number of tests", type=int, default=100) 43 | # parser.add_argument("-f", "--model_file_path", type=str, default="../logs/test_model.pkl") 44 | # parser.add_argument("--params", help="N,H,W,L,C,k,K,stride,padding", type=str, default="1,14,14,14,512,3,512,1,1") 45 | # parser.add_argument("--epoch", type=int, default=5) 46 | # parser.add_argument("--sample", type=int, default=16) 47 | # args = parser.parse_args() 48 | # test = not args.train 49 | # use_torch = args.pytorch 50 | # use_auto = args.auto_schedule 51 | # try: 52 | # params = [int(x) for x in args.params.split(",")] 53 | # batch_size, height, width, depth, channel, kernel_size, output_channel, stride, padding = params 54 | # if use_torch: 55 | # pytorch_baseliine(batch_size, height, width, depth, channel, kernel_size, output_channel, stride, padding, args.number) 56 | # if use_auto: 57 | # run(batch_size, height, width, depth, channel, kernel_size, output_channel, stride, padding, 58 | # args.model_file_path, args.epoch, args.sample, args.number, test) 59 | # except Exception as e: 60 | # raise ValueError("Bad parameters, please refer to usage") 61 | arg_lst = [ 62 | (512, 512), 63 | (1024, 1024), 64 | (2048, 2048), 65 | ] 66 | 67 | names = [ 68 | "512", 69 | "1024", 70 | "2048" 71 | ] 72 | 73 | for i in range(len(arg_lst)): 74 | model_path = "opt_outer_" + names[i] + "_cpu.pkl" 75 | entities = [] 76 | func = FUNC_TABLE["outer_product"].func 77 | args = arg_lst[i] 78 | entities.append(Entity("outer_product", args)) 79 | model_path = os.path.abspath(model_path) 80 | train_op_schedule_cpu_general_dx(entities, 10, 20, model_path, logfile="process_outer_" + names[i] + "_cpu.txt", device="cuda:1") 81 | -------------------------------------------------------------------------------- /flextensor/examples/transfer_cpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | from flextensor.train import Entity, train_op_schedule_cpu_general_dx 3 | 4 | 5 | arg_lst = [ 6 | (1, 7, 7, 1024, 3, 3, 1024, 1, 1), 7 | # (8, 7, 7, 1024, 3, 3, 1024, 1, 1), 8 | # (64, 7, 7, 1024, 3, 3, 1024, 1, 1), 9 | # 
(256, 7, 7, 1024, 3, 3, 1024, 1, 1), 10 | (1, 14, 14, 1024, 1, 1, 512, 1, 0), 11 | (1, 28, 28, 256, 3, 3, 512, 1, 1), 12 | (1, 28, 28, 512, 1, 1, 256, 1, 0), 13 | (1, 56, 56, 128, 3, 3, 256, 1, 1), 14 | (1, 56, 56, 192, 1, 1, 128, 1, 0), 15 | (1, 112, 112, 64, 3, 3, 192, 1, 1), 16 | (1, 448, 448, 3, 7, 7, 64, 2, 3), 17 | (1, 1024, 1024, 1024), 18 | ] 19 | 20 | names = [ 21 | "yolo24_b1", 22 | # "yolo24_b8", 23 | # "yolo24_b64", 24 | # "yolo24_b256", 25 | "yolo19_b1", 26 | "yolo10_b1", 27 | "yolo7_b1", 28 | "yolo4_b1", 29 | "yolo3_b1", 30 | "yolo2_b1", 31 | "yolo1_b1", 32 | "gemm_1024" 33 | ] 34 | 35 | func_names = ["conv2d_nchw"] * 8 + ["matmul_batch"] 36 | 37 | 38 | def transfer(pre_train, post_train): 39 | entities = [] 40 | for i in pre_train: 41 | args = arg_lst[i] 42 | entities.append(Entity(func_names[i], args)) 43 | model_path = "cpu_transfer_pre{}_post{}.pkl".format(pre_train, post_train) 44 | model_path = os.path.abspath(model_path) 45 | train_op_schedule_cpu_general_dx(entities, 20, 50, model_path, 46 | logfile="process_transfer_pre{}_(post{})_cpu.txt".format(pre_train, post_train), 47 | device="cuda:3") 48 | entities = [] 49 | for i in post_train: 50 | args = arg_lst[i] 51 | entities.append(Entity(func_names[i], args)) 52 | model_path = "cpu_transfer_pre{}_post{}.pkl".format(pre_train, post_train) 53 | model_path = os.path.abspath(model_path) 54 | train_op_schedule_cpu_general_dx(entities, 20, 50, model_path, 55 | logfile="process_transfer_(pre_{})_post{}_cpu.txt".format(pre_train, post_train), 56 | device="cuda:3") 57 | 58 | 59 | if __name__ == "__main__": 60 | transfer([1, 3], [5]) 61 | transfer([0, 2], [4]) 62 | transfer([0, 1, 2], [6]) 63 | transfer([1, 3, 5], [8]) 64 | -------------------------------------------------------------------------------- /flextensor/examples/transfer_gpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | from flextensor.train import Entity, train_op_schedule_gpu_general_dx 3 | 4 | 5 | arg_lst = [ 6 | (1, 7, 7, 1024, 3, 3, 1024, 1, 1), 7 | # (8, 7, 7, 1024, 3, 3, 1024, 1, 1), 8 | # (64, 7, 7, 1024, 3, 3, 1024, 1, 1), 9 | # (256, 7, 7, 1024, 3, 3, 1024, 1, 1), 10 | (1, 14, 14, 1024, 1, 1, 512, 1, 0), 11 | (1, 28, 28, 256, 3, 3, 512, 1, 1), 12 | (1, 28, 28, 512, 1, 1, 256, 1, 0), 13 | (1, 56, 56, 128, 3, 3, 256, 1, 1), 14 | (1, 56, 56, 192, 1, 1, 128, 1, 0), 15 | (1, 112, 112, 64, 3, 3, 192, 1, 1), 16 | (1, 448, 448, 3, 7, 7, 64, 2, 3), 17 | (1, 1024, 1024, 1024), 18 | ] 19 | 20 | names = [ 21 | "yolo24_b1", 22 | # "yolo24_b8", 23 | # "yolo24_b64", 24 | # "yolo24_b256", 25 | "yolo19_b1", 26 | "yolo10_b1", 27 | "yolo7_b1", 28 | "yolo4_b1", 29 | "yolo3_b1", 30 | "yolo2_b1", 31 | "yolo1_b1", 32 | "gemm_1024" 33 | ] 34 | 35 | func_names = ["conv2d_nchw"] * 8 + ["matmul_batch"] 36 | 37 | 38 | def transfer(pre_train, post_train): 39 | entities = [] 40 | for i in pre_train: 41 | args = arg_lst[i] 42 | entities.append(Entity(func_names[i], args)) 43 | model_path = "gpu_transfer_pre{}_post{}.pkl".format(pre_train, post_train) 44 | model_path = os.path.abspath(model_path) 45 | train_op_schedule_gpu_general_dx(entities, 20, 50, model_path, 46 | logfile="process_transfer_pre_{}_(post{})_gpu.txt".format(pre_train, post_train), 47 | device="cuda:0") 48 | entities = [] 49 | for i in post_train: 50 | args = arg_lst[i] 51 | entities.append(Entity(func_names[i], args)) 52 | model_path = "gpu_transfer0_pre{}_post{}.pkl".format(pre_train, post_train) 53 | model_path = os.path.abspath(model_path) 54 | 
train_op_schedule_gpu_general_dx(entities, 20, 50, model_path, 55 | logfile="process_transfer_(pre_{})_post{}_gpu.txt".format(pre_train, post_train), 56 | device="cuda:0") 57 | 58 | 59 | if __name__ == "__main__": 60 | transfer([1, 3], [5]) 61 | transfer([0, 2], [4]) 62 | transfer([0, 1, 2], [6]) 63 | transfer([1, 3, 5], [8]) 64 | -------------------------------------------------------------------------------- /flextensor/nn/README.md: -------------------------------------------------------------------------------- 1 | ## Steps for Adding an Operator 2 | 3 | 1. Add the implementation of the operator to `.ops.py`, e.g. 4 | ```python 5 | def gemm(A, B, transposeA=False, transposeB=False): 6 | """Matrix multiplies matrix 7 | 8 | Args: 9 | ----------------------------- 10 | A: tvm.te.tensor.Tensor 11 | shape [height, width] 12 | B: tvm.te.tensor.Tensor 13 | shape [width, length] 14 | transposeA: (optional:False) bool 15 | transposeB: (optional:False) bool 16 | ----------------------------- 17 | 18 | Returns: 19 | ----------------------------- 20 | tvm.te.tensor.Tensor 21 | shape [height, length] 22 | ----------------------------- 23 | """ 24 | if transposeA and transposeB: 25 | k = tvm.te.reduce_axis((0, B.shape[1])) 26 | assert_print(A.shape[0].value == B.shape[1].value) 27 | return tvm.te.compute((A.shape[1], B.shape[0]), lambda i, j: tvm.te.sum(A[k, i] * B[j, k], axis=k)) 28 | elif transposeA and not transposeB: 29 | k = tvm.te.reduce_axis((0, B.shape[0])) 30 | assert_print(A.shape[0].value == B.shape[0].value) 31 | return tvm.te.compute((A.shape[1], B.shape[1]), lambda i, j: tvm.te.sum(A[k, i] * B[k, j], axis=k)) 32 | elif not transposeA and transposeB: 33 | k = tvm.te.reduce_axis((0, B.shape[1])) 34 | assert_print(A.shape[1].value == B.shape[1].value) 35 | return tvm.te.compute((A.shape[0], B.shape[0]), lambda i, j: tvm.te.sum(A[i, k] * B[j, k], axis=k)) 36 | else: 37 | k = tvm.te.reduce_axis((0, B.shape[0])) 38 | assert_print(A.shape[1].value == B.shape[0].value) 39 | return tvm.te.compute((A.shape[0], B.shape[1]), lambda i, j: tvm.te.sum(A[i, k] * B[k, j], axis=k)) 40 | ``` 41 | 2. Add a configuration file under `../configs/`, e.g. `../configs/gemm_config.py` 42 | 43 | ```python 44 | gemm_shapes = [ 45 | (32, 32, 32), 46 | (64, 64, 64), 47 | (128, 128, 128), 48 | (256, 256, 256), 49 | (512, 512, 512), 50 | (1024, 1024, 1024), 51 | (2048, 2048, 2048) 52 | ] 53 | ``` 54 | 3. Register a task for the operator in `../task.py`, e.g. 55 | 56 | ```python 57 | from flextensor.nn.ops import gemm as op_gemm 58 | 59 | def gemm(N, K, M): 60 | A = tvm.te.placeholder((N, K)) 61 | B = tvm.te.placeholder((K, M)) 62 | Output = op_gemm(A, B) 63 | return [Output.op], [A, B, Output] 64 | 65 | from flextensor.configs.gemm_config import gemm_shapes 66 | 67 | for shape in gemm_shapes: 68 | N, K, M = shape 69 | for j in range(4): 70 | register_task(Task("gemm", "gemm", gemm, (N, K, M), "llvm", j)) 71 | register_task(Task("gemm", "gemm", gemm, (N, K, M), "cuda", j)) 72 | ``` 73 | 4. (Optional) Add a correctness test for the operator in `../test/test_ops.py` (a sketch is given below) 74 | 5. Add an optimization test for the operator under `../optimize/`
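A minimal sketch of such a correctness test for step 4 (an illustration, not the repository's actual test code; it assumes the default `float32` dtype, an LLVM-enabled TVM build, and the `gemm` operator from step 1):

```python
import numpy as np
import tvm
from flextensor.nn.ops import gemm


def test_gemm():
    N, K, M = 64, 64, 64
    A = tvm.te.placeholder((N, K), name="A")
    B = tvm.te.placeholder((K, M), name="B")
    C = gemm(A, B)
    # use the default (naive) schedule; correctness does not depend on scheduling
    s = tvm.te.create_schedule(C.op)
    func = tvm.build(s, [A, B, C], "llvm")
    ctx = tvm.cpu(0)
    a = np.random.uniform(size=(N, K)).astype("float32")
    b = np.random.uniform(size=(K, M)).astype("float32")
    c = tvm.nd.array(np.zeros((N, M), dtype="float32"), ctx)
    func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
    # numpy serves as the reference implementation
    np.testing.assert_allclose(c.asnumpy(), np.matmul(a, b), rtol=1e-4)
```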
-------------------------------------------------------------------------------- /flextensor/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .ops import (conv1d, conv_transpose1d, conv2d_nchw, conv_transpose2d_nchw, 2 | conv3d_ncdhw, conv_transpose3d_ncdhw, depthwise_conv2d_nchw, 3 | conv2d_nhwc, gemm_conv2d_nchw, gemv, gemm, batch_gemm, linear, 4 | bilinear, MTTKRP3d, pointwise_multiply, mean, variance, 5 | batch_normalization2d, block_circulant_matrix, MaxUnpooling1d, 6 | MaxUnpooling2d, ShiftConv2d_nhwc, PixelCNN, GatedPixelCNN, conv2d_nchwc, 7 | winograd_conv2d_nchw) 8 | from .layers import (YoloConvLayer1, YoloConvLayer2, YoloConvLayer3, YoloConvLayer4, 9 | YoloConvLayer5, YoloConvLayer6, YoloConvLayer7, YoloConvLayer8, 10 | YoloConvLayer9, YoloConvLayer10, YoloConvLayer11, YoloConvLayer12, 11 | YoloConvLayer13, YoloConvLayer14, YoloConvLayer15, 12 | YoloGemmConvLayer1, YoloGemmConvLayer17, YoloGemmConvLayer24, 13 | SqueezeNetFire8, SqueezeNetFire8Gemm) -------------------------------------------------------------------------------- /flextensor/optimize/README.md: -------------------------------------------------------------------------------- 1 | # Testing Operators for Mali GPU Optimized by FlexTensor 2 | 3 | [TOC] 4 | 5 | ## Preparation 6 | 7 | ### [Installation of TVM](https://tvm.apache.org/docs/install/) 8 | 9 | Note: you only need to turn on `USE_LLVM` in `config.cmake`. 10 | 11 | ### [Installation of TVM Java Frontend](https://github.com/apache/incubator-tvm/blob/main/jvm/README.md) 12 | 13 | ### [Installation & Setup of TVM-RPC App](https://github.com/apache/incubator-tvm/blob/main/apps/android_rpc/README.md) 14 | 15 | 16 | 17 | ## Test 18 | 19 | 1. Set up the environment: 20 | 21 | 1. Start the TVM RPC tracker and the TVM RPC app. 22 | 2. Export `/path/to/FlexTensor` to `PYTHONPATH`. 23 | 24 | 2. Open `FlexTensor/flextensor/optimize`. 25 | 26 | 3. Run the following commands to test the optimized operators (gemm, conv1d, and conv2d), assuming the tracker's ip:port is 0.0.0.0:9190: 27 | 28 | ```shell 29 | python3 optimize_gemm.py \ 30 | --target_host "llvm -mtriple=aarch64-linux-android" \ 31 | --host 0.0.0.0 --port 9190 \ 32 | --use_rpc tracker \ 33 | --fcompile ndk \ 34 | --device_key android \ 35 | --target opencl \ 36 | --test gemm-config.log 37 | 38 | python3 optimize_conv1d.py \ 39 | --target_host "llvm -mtriple=aarch64-linux-android" \ 40 | --host 0.0.0.0 --port 9190 \ 41 | --use_rpc tracker \ 42 | --fcompile ndk \ 43 | --device_key android \ 44 | --target opencl \ 45 | --test conv1d-config.log 46 | 47 | python3 optimize_conv2d.py \ 48 | --target_host "llvm -mtriple=aarch64-linux-android" \ 49 | --host 0.0.0.0 --port 9190 \ 50 | --use_rpc tracker \ 51 | --fcompile ndk \ 52 | --device_key android \ 53 | --target opencl \ 54 | --test conv2d-config.log 55 | ``` 56 | 57 | Then you will see the generated kernels and evaluation results.
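Each record in a `*-config.log` file is one line of the form `task_key:config`, where `config` is a JSON value holding the per-op schedule decisions (spatial/reduce splits, fuse, reorder, unroll, ...) plus a graph-level part (e.g. inline decisions). A quick way to inspect these records is sketched below (it assumes only the `key:[...]` line layout of these logs; FlexTensor's own loader may differ):

```python
import json

with open("gemm-config.log") as fin:
    for line in fin:
        # split at the first ":[" -- everything before it is the task key
        key, _, rest = line.strip().partition(":[")
        op_configs, graph_config = json.loads("[" + rest)
        print(key, "->", [sorted(c.keys()) for c in op_configs])
```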
58 | 59 | -------------------------------------------------------------------------------- /flextensor/optimize/conv1d-config.log: -------------------------------------------------------------------------------- 1 | conv1d_conv1d_(1, 192, 3136, 128, 1, 1, 0, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1, 24, 1, 8], [1, 784, 2, 2]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [8, 8, 2, 1], [112, 1, 28, 1]], "reduce": [[2, 2, 48], [1, 1, 1]], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 2 | conv1d_conv1d_(1, 128, 3136, 256, 9, 1, 1, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1, 2, 1, 64], [3, 1, 1, 1046]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1, 2, 1, 128], [1, 313, 5, 2]], "reduce": [[32, 4, 1], [1, 9, 1]], "reorder": [[0]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 3 | conv1d_conv1d_(1, 512, 784, 256, 1, 1, 0, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1, 64, 1, 8], [1, 16, 1, 49]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [16, 4, 1, 4], [56, 1, 14, 1]], "reduce": [[1, 256, 2], [1, 1, 1]], "reorder": [[0]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 4 | conv1d_conv1d_(1, 256, 784, 512, 9, 1, 1, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [128, 1, 2, 1], [3, 1, 131, 2]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1, 512, 1, 1], [2, 389, 1, 1]], "reduce": [[2, 32, 4], [3, 3, 1]], "reorder": [[2]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 5 | conv1d_conv1d_(1, 1024, 196, 512, 1, 1, 0, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1024, 1, 1, 1], [2, 1, 49, 2]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [32, 1, 8, 2], [1, 2, 14, 7]], "reduce": [[1, 32, 32], [1, 1, 1]], "reorder": [[1]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 6 | conv1d_conv1d_(1, 512, 196, 1024, 9, 1, 1, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [64, 1, 2, 4], [1, 1, 99, 2]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1, 4, 1, 256], [1, 2, 1, 95]], "reduce": [[64, 2, 4], [1, 3, 3]], "reorder": [[0]], "inline": [], "unroll": [[1, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], 
"merge": [], "special": []}] 7 | -------------------------------------------------------------------------------- /flextensor/optimize/conv2d-config.log: -------------------------------------------------------------------------------- 1 | conv2d_mobile_v21_(1, 16, 112, 112, 96, 3, 2, 1, 1, 16)_opencl(0):[[{"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [8, 2, 1, 1], [114, 1, 1, 1], [1, 1, 57, 2]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}, {"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [48, 1, 2, 1], [8, 1, 1, 7], [1, 1, 14, 4]], "reduce": [[1, 1, 1], [3, 1, 1], [3, 1, 1]], "reorder": [[0]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 2 | conv2d_mobile_v22_(1, 24, 56, 56, 144, 3, 2, 1, 1, 24)_opencl(0):[[{"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [4, 3, 2, 1], [29, 1, 1, 2], [1, 1, 58, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}, {"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [24, 2, 3, 1], [2, 2, 1, 7], [1, 1, 28, 1]], "reduce": [[1, 1, 1], [1, 3, 1], [1, 1, 3]], "reorder": [[1]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 3 | conv2d_mobile_v23_(1, 32, 28, 28, 192, 3, 2, 1, 1, 32)_opencl(0):[[{"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [32, 1, 1, 1], [5, 1, 2, 3], [1, 1, 30, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}, {"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [32, 2, 3, 1], [1, 1, 7, 2], [1, 2, 7, 1]], "reduce": [[1, 1, 1], [3, 1, 1], [1, 1, 3]], "reorder": [[2]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 4 | conv2d_mobile_v24_(1, 64, 14, 14, 384, 3, 1, 1, 1, 64)_opencl(0):[[{"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [8, 1, 4, 2], [4, 1, 4, 1], [1, 1, 8, 2]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}, {"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [48, 1, 2, 4], [2, 1, 7, 1], [1, 1, 7, 2]], "reduce": [[1, 1, 1], [1, 1, 3], [1, 3, 1]], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 5 | conv2d_mobile_v25_(1, 96, 14, 14, 576, 3, 2, 1, 1, 96)_opencl(0):[[{"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [8, 1, 12, 1], [2, 1, 8, 1], [1, 2, 2, 4]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}, {"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [48, 1, 12, 1], [1, 1, 1, 7], [1, 1, 7, 1]], "reduce": [[1, 1, 1], [1, 1, 3], [3, 1, 1]], "reorder": [[2]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 6 | conv2d_mobile_v26_(1, 160, 7, 7, 960, 3, 1, 1, 1, 160)_opencl(0):[[{"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [16, 5, 1, 2], [1, 1, 9, 1], [1, 1, 9, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}, {"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [160, 
1, 6, 1], [1, 7, 1, 1], [1, 1, 7, 1]], "reduce": [[1, 1, 1], [1, 3, 1], [1, 3, 1]], "reorder": [[2]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 7 | -------------------------------------------------------------------------------- /flextensor/optimize/depthwise_conv2d-config.log: -------------------------------------------------------------------------------- 1 | conv2d_depthwise_(1, 32, 112, 112, 1, 3, 1, 1, 1)_opencl(0):[[{"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [8, 4, 1, 1], [114, 1, 1, 1], [1, 2, 57, 1]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}, {"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [8, 2, 2, 1], [56, 1, 1, 2], [7, 1, 8, 2]], "reduce": [[1, 3, 1], [3, 1, 1]], "reorder": [[0]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 2 | conv2d_depthwise_(1, 16, 112, 112, 6, 3, 2, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [1, 2, 1, 8], [1, 19, 1, 6], [1, 1, 1, 114]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}, {"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [32, 3, 1, 1], [2, 7, 2, 2], [7, 1, 4, 2]], "reduce": [[1, 1, 3], [1, 1, 3]], "reorder": [[2]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 3 | conv2d_depthwise_(1, 24, 56, 56, 6, 3, 2, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [6, 4, 1, 1], [58, 1, 1, 1], [1, 1, 58, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[0, 0]], "merge": [], "special": []}, {"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [24, 1, 1, 6], [7, 1, 4, 1], [1, 1, 14, 2]], "reduce": [[3, 1, 1], [1, 3, 1]], "reorder": [[0]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 4 | conv2d_depthwise_(1, 32, 28, 28, 6, 3, 2, 1, 1)_opencl(0):[[{"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [8, 2, 1, 2], [3, 1, 10, 1], [1, 2, 15, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}, {"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [32, 6, 1, 1], [7, 1, 2, 1], [1, 1, 14, 1]], "reduce": [[3, 1, 1], [1, 1, 3]], "reorder": [[2]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 5 | conv2d_depthwise_(1, 64, 14, 14, 6, 3, 1, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [1, 1, 64, 1], [8, 1, 2, 1], [2, 8, 1, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}, {"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [64, 1, 6, 1], [1, 2, 7, 1], [1, 1, 7, 2]], "reduce": [[1, 1, 3], [3, 1, 1]], "reorder": [[0]], "inline": [], "unroll": [[0, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 6 | conv2d_depthwise_(1, 96, 14, 14, 6, 3, 2, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [32, 1, 3, 1], [1, 
2, 4, 2], [1, 2, 8, 1]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}, {"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [192, 1, 3, 1], [1, 1, 7, 1], [1, 1, 7, 1]], "reduce": [[1, 3, 1], [3, 1, 1]], "reorder": [[0]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 7 | conv2d_depthwise_(1, 160, 7, 7, 6, 3, 1, 1, 1)_opencl(0):[[{"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [40, 1, 2, 2], [3, 3, 1, 1], [1, 1, 9, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}, {"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [240, 1, 4, 1], [1, 1, 1, 7], [1, 1, 7, 1]], "reduce": [[1, 1, 3], [1, 1, 3]], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 8 | -------------------------------------------------------------------------------- /flextensor/optimize/gemm-config.log: -------------------------------------------------------------------------------- 1 | gemm_gemm_(256, 128, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 1, 2, 2], [1, 1, 32, 4]], "reduce": [[2, 32, 2]], "reorder": [[0]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 2 | gemm_gemm_(256, 256, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 4, 2, 1], [2, 1, 32, 4]], "reduce": [[1, 64, 4]], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 3 | -------------------------------------------------------------------------------- /flextensor/optimize/gemm-config.old.log: -------------------------------------------------------------------------------- 1 | gemm_gemm_(128, 128, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 2, 1, 1], [1, 1, 64, 2]], "reduce": [[64, 2, 1]], "reorder": [[0]], "inline": [], "unroll": [[1, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 2 | gemm_gemm_(256, 256, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[128, 1, 2, 1], [16, 1, 4, 4]], "reduce": [[4, 2, 32]], "reorder": [[1]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 3 | gemm_gemm_(256, 128, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 2, 2, 1], [2, 1, 32, 2]], "reduce": [[32, 2, 2]], "reorder": [[0]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 4 | gemm_gemm_(128, 256, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 2, 2, 1], [1, 1, 64, 2]], "reduce": [[8, 8, 4]], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 5 | gemm_gemm_(128, 128, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 2, 2, 1], [8, 1, 16, 2]], "reduce": 
[[1, 32, 4]], "reorder": [[0]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 6 | gemm_gemm_(128, 256, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 4, 1, 1], [4, 1, 64, 1]], "reduce": [[16, 4, 4]], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 7 | gemm_gemm_(256, 128, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 1, 2, 2], [4, 1, 8, 8]], "reduce": [[32, 4, 1]], "reorder": [[0]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 8 | gemm_gemm_(256, 256, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[128, 1, 2, 1], [4, 1, 8, 4]], "reduce": [[16, 16, 1]], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 9 | -------------------------------------------------------------------------------- /flextensor/optimize/gemm-config.v0.log: -------------------------------------------------------------------------------- 1 | gemm_gemm_(128, 128, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 2, 1, 1], [1, 1, 64, 2]], "reduce": [[64, 2, 1]], "reorder": [[0]], "inline": [], "unroll": [[1, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 2 | gemm_gemm_(256, 256, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[128, 1, 2, 1], [16, 1, 4, 4]], "reduce": [[4, 2, 32]], "reorder": [[1]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 3 | gemm_gemm_(256, 128, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 2, 2, 1], [2, 1, 32, 2]], "reduce": [[32, 2, 2]], "reorder": [[0]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 4 | gemm_gemm_(128, 256, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 2, 2, 1], [1, 1, 64, 2]], "reduce": [[8, 8, 4]], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 5 | gemm_gemm_(128, 128, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 2, 2, 1], [8, 1, 16, 2]], "reduce": [[1, 32, 4]], "reorder": [[0]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 6 | gemm_gemm_(128, 256, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 4, 1, 1], [4, 1, 64, 1]], "reduce": [[16, 4, 4]], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 7 | gemm_gemm_(256, 128, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 1, 2, 2], [4, 1, 8, 8]], "reduce": [[32, 4, 1]], "reorder": 
[[0]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 8 | gemm_gemm_(256, 256, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[128, 1, 2, 1], [4, 1, 8, 4]], "reduce": [[16, 16, 1]], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 9 | -------------------------------------------------------------------------------- /flextensor/optimize/gemm-config.v1.log: -------------------------------------------------------------------------------- 1 | gemm_gemm_(256, 256, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 4, 1, 2], [8, 2, 8, 2]], "reduce": [[8, 16, 2]], "reorder": [[0]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 2 | gemm_gemm_(256, 128, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 1, 1, 8], [8, 2, 4, 2]], "reduce": [[1, 1, 128]], "reorder": [[0]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 3 | -------------------------------------------------------------------------------- /flextensor/optimize/run_remote_opencl_conv1d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/env bash 2 | running() { 3 | beg=$1 4 | end=$(python3 -c "print($beg + 1)") 5 | # timeout=$(python3 -c "print((($beg + 3) * 2) if $beg < 6 else 7)") 6 | timeout=4 7 | stdbuf --output=0 --error=0 python3 optimize_conv1d.py \ 8 | --target_host "llvm -mtriple=aarch64-linux-android" \ 9 | --host 0.0.0.0 --port 9190 \ 10 | --use_rpc tracker \ 11 | --fcompile ndk \ 12 | --device_key android \ 13 | --target opencl \ 14 | --timeout $timeout \ 15 | --parallel 6 \ 16 | -f $beg -t $end \ 17 | -l conv1d-config.log \ 18 | 1>conv1d-$beg.log 2>conv1d-$beg.log 19 | # --test conv1d-config.log 20 | } 21 | 22 | start=${1:-0} 23 | stop=${2:-6} 24 | 25 | set -x 26 | for ((i = $start; i < $stop; i++)); do 27 | running $i 28 | done 29 | -------------------------------------------------------------------------------- /flextensor/optimize/run_remote_opencl_conv2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/env bash 2 | running() { 3 | beg=$1 4 | end=$(python3 -c "print($beg + 1)") 5 | # timeout=$(python3 -c "print((($beg + 3) * 2) if $beg < 6 else 7)") 6 | timeout=5 7 | stdbuf --output=0 --error=0 python3 optimize_conv2d.py \ 8 | --target_host "llvm -mtriple=aarch64-linux-android" \ 9 | --host 0.0.0.0 --port 9190 \ 10 | --use_rpc tracker \ 11 | --fcompile ndk \ 12 | --device_key android \ 13 | --target opencl \ 14 | --timeout $timeout \ 15 | --parallel 6 \ 16 | -f $beg -t $end \ 17 | -l conv2d-config.log \ 18 | --shapes mobile_v2 \ 19 | 1>conv2d-$beg.log 2>conv2d-$beg.log 20 | # --test conv2d-config.log 21 | } 22 | 23 | start=${1:-0} 24 | stop=${2:-7} 25 | 26 | set -x 27 | for ((i = $start; i < $stop; i++)); do 28 | running $i 29 | done 30 | -------------------------------------------------------------------------------- /flextensor/optimize/run_remote_opencl_depthwise_conv2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/env 
bash 2 | running() { 3 | beg=$1 4 | end=$(python3 -c "print($beg + 1)") 5 | name=depthwise_conv2d 6 | # timeout=$(python3 -c "print((($beg + 3) * 2) if $beg < 6 else 7)") 7 | timeout=4 8 | stdbuf --output=0 --error=0 python3 optimize_$name.py \ 9 | --target_host "llvm -mtriple=aarch64-linux-android" \ 10 | --host 0.0.0.0 --port 9190 \ 11 | --use_rpc tracker \ 12 | --fcompile ndk \ 13 | --device_key android \ 14 | --target opencl \ 15 | --timeout $timeout \ 16 | --parallel 6 \ 17 | -f $beg -t $end \ 18 | -l $name-config.log \ 19 | --test $name-config.log --check 20 | # 1>$name-$beg.log 2>$name-$beg.log 21 | } 22 | 23 | start=${1:-0} 24 | stop=${2:-7} 25 | 26 | set -x 27 | for ((i = $start; i < $stop; i++)); do 28 | running $i 29 | done 30 | -------------------------------------------------------------------------------- /flextensor/optimize/run_remote_opencl_gemm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/env bash 2 | running() { 3 | beg=$1 4 | end=$(python3 -c "print($beg + 1)") 5 | timeout=$(python3 -c "print((($beg + 3) * 2) if $beg < 6 else 7)") 6 | # timeout=4 7 | stdbuf --output=0 --error=0 python3 optimize_gemm.py \ 8 | --target_host "llvm -mtriple=aarch64-linux-android" \ 9 | --host 0.0.0.0 --port 9190 \ 10 | --use_rpc tracker \ 11 | --fcompile ndk \ 12 | --device_key android \ 13 | --target opencl \ 14 | --timeout $timeout \ 15 | --parallel 6 \ 16 | -f $beg -t $end \ 17 | --test gemm-config.log --check 18 | # -l gemm-config.log \ 19 | # 1>gemm-$beg.log 2>gemm-$beg.log 20 | } 21 | 22 | start=${1:-0} 23 | stop=${2:-12} 24 | 25 | set -x 26 | for ((i = $start; i < $stop; i++)); do 27 | running $i 28 | done 29 | -------------------------------------------------------------------------------- /flextensor/project/tensor_graph/README.md: -------------------------------------------------------------------------------- 1 | # Tensor Graph: Using GNNs to Optimize Tensor Operators 2 | 3 | 1. Prerequisites: 4 | - Python >= 3.5 5 | - PyTorch >= 1.2.0 6 | - [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html) 7 | - [TVM >= 0.6.0](https://docs.tvm.ai/install/from_source.html) 8 | - [FlexTensor](https://github.com/KnowingNothing/FlexTensor.git) 9 | 10 | 2. Run: 11 | `python train.py --help` to see optional knobs 12 | 13 | 3. Use a trained model on Titan X: 14 | `python train.py --only_test --fmodel gemm_model/gemm_model.pkl --ftest dataset/gemm_test.txt --eval_dev 0` 15 | 16 | 4. The dataset: 17 | 18 | GEMM: gemm_train.txt, gemm_test.txt 19 | 20 | Conv2d: conv2d_train.txt, conv2d_test.txt 21 | 22 | 5. Any problems: 23 | File issues to https://github.com/KnowingNothing/FlexTensor.git 24 | Tensor Graph is currently an experimental feature of FlexTensor.
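6. Regenerating the dataset split:
   `dataset/preprocess.py` shuffles `all.txt` and rewrites `all_train.txt`/`all_test.txt` with a 0.8 train ratio. Run it from inside `dataset/` (it assumes `all.txt` is in the working directory): `cd dataset && python preprocess.py`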
-------------------------------------------------------------------------------- /flextensor/project/tensor_graph/conv2d_model/conv2d_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/project/tensor_graph/conv2d_model/conv2d_model.pkl -------------------------------------------------------------------------------- /flextensor/project/tensor_graph/dataset/preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | ratio = 0.8 5 | 6 | 7 | if __name__ == "__main__": 8 | raw = [] 9 | with open("all.txt", "r") as fin: 10 | for line in fin: 11 | raw.append(line) 12 | if not raw[-1][-1] == "\n": 13 | raw[-1] = raw[-1] + "\n" 14 | np.random.shuffle(raw) 15 | length = int(ratio * len(raw)) 16 | train = raw[:length] 17 | test = raw[length:] 18 | with open("all_train.txt", "w") as fout: 19 | fout.writelines(train) 20 | with open("all_test.txt", "w") as fout: 21 | fout.writelines(test) 22 | -------------------------------------------------------------------------------- /flextensor/project/tensor_graph/gemm_model/gemm_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/project/tensor_graph/gemm_model/gemm_model.pkl -------------------------------------------------------------------------------- /flextensor/project/tensor_graph/node.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import tvm 4 | from utils import strict_limit 5 | 6 | 7 | class Node(object): 8 | def __init__(self, feature, name=None): 9 | if not isinstance(feature, (list, tuple)): 10 | feature = [feature] 11 | self.feature = feature 12 | self.name = name 13 | 14 | 15 | def make_nodes_from_tensor(tensor): 16 | """ 17 | return: list of Node, one per tensor dimension, named "<tensor_name>/<dim>" 18 | """ 19 | assert isinstance(tensor, tvm.te.tensor.Tensor), strict_limit("tvm.te.tensor.Tensor") 20 | node_lst = [] 21 | for dim, val in enumerate(tensor.shape): 22 | assert isinstance(val, tvm.tir.IntImm), strict_limit("tvm.tir.IntImm") 23 | node_lst.append(Node(val.value, name="%s/%d" % (tensor.name, dim))) 24 | return node_lst 25 | 26 | 27 | def make_node_from_var(var, feature): 28 | assert isinstance(var, tvm.tir.Var), strict_limit("tvm.tir.Var") 29 | return Node(feature, name=var.name) 30 | 31 | -------------------------------------------------------------------------------- /flextensor/project/tensor_graph/preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | ratio = 0.8 5 | 6 | 7 | if __name__ == "__main__": 8 | raw = [] 9 | with open("data.txt", "r") as fin: 10 | for line in fin: 11 | raw.append(line) 12 | if not raw[-1][-1] == "\n": 13 | raw[-1] = raw[-1] + "\n" 14 | np.random.shuffle(raw) 15 | length = int(ratio * len(raw)) 16 | train = raw[:length] 17 | test = raw[length:] 18 | with open("train.txt", "w") as fout: 19 | fout.writelines(train) 20 | with open("test.txt", "w") as fout: 21 | fout.writelines(test) -------------------------------------------------------------------------------- /flextensor/templates/__init__.py: -------------------------------------------------------------------------------- 1 | from .cpu import * 2 | from .cuda import * 3 | from .opencl import * 4 | 
-------------------------------------------------------------------------------- /flextensor/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/test/__init__.py -------------------------------------------------------------------------------- /flextensor/test/naive_schedule_all.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import sys 4 | import tvm 5 | from flextensor.task import TASK_TABLE 6 | 7 | 8 | def print_source(s, bufs, target, file=sys.stdout): 9 | func = tvm.build(s, bufs, target) 10 | if target in ["cuda", "opencl"]: 11 | print(func.imported_modules[0].get_source(), file=file) 12 | 13 | def recursive_fuse(s, cur, flag=False): 14 | for t in s[cur].op.input_tensors: 15 | if isinstance(t.op, tvm.te.tensor.ComputeOp): 16 | recursive_fuse(s, t.op, True) 17 | if flag: 18 | s[cur].compute_inline() 19 | 20 | hit_set = set() 21 | for task in TASK_TABLE.values(): 22 | if (task.target == "cuda" and "gemm_conv" not in task.key 23 | and "mttkrp" not in task.key and "block_circulant_matrix" not in task.key 24 | and "pixel" not in task.key and "unpool" not in task.key and "shift" not in task.key 25 | and "packed" not in task.key): 26 | prefix = task.key.rsplit("_", 4)[0] 27 | if prefix in hit_set: 28 | continue 29 | hit_set.add(prefix) 30 | outops, bufs = task.func(*task.args) 31 | s = tvm.te.create_schedule(outops) 32 | bx = tvm.te.thread_axis("blockIdx.x") 33 | op = outops[0] 34 | recursive_fuse(s, op) 35 | outer, inner = s[op].split(s[op].op.axis[0], nparts=1) 36 | s[op].bind(outer, bx) 37 | print(task.key) 38 | print_source(s, bufs, "opencl") -------------------------------------------------------------------------------- /flextensor/test/pyimpl.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | 4 | import numpy as np 5 | 6 | 7 | def conv2d_nchwc(inputs, weight, bias=None, stride=1, padding=0, dilation=1, groups=1): 8 | """Convolution 2d NCHWc layout 9 | 10 | Args: 11 | ----------------------------- 12 | inputs : np.ndarray 13 | shape [batch, channel // vlen, height, width, vlen] 14 | weight : np.ndarray 15 | shape [out_channel // vlen, channel // vlen // groups, kernel_height, kernel_width, vlen(i), vlen(o)] 16 | bias : (optional:None) np.ndarray 17 | shape [out_channel // vlen, vlen] 18 | stride : (optional:1) int or tuple 19 | 20 | padding : (optional:0) int or tuple 21 | 22 | dilation: (optional:1) int 23 | 24 | groups : (optional:1) int 25 | ----------------------------- 26 | 27 | Returns: 28 | ----------------------------- 29 | np.ndarray 30 | shape [batch, out_channel // vlen, output_height, output_width, vlen] 31 | ----------------------------- 32 | """ 33 | batch_size, in_channel_chunk, in_h, in_w, in_channel_block = inputs.shape 34 | out_channel_chunk, channel_per_group_chunk, k_h, k_w, _in_channel_block, out_channel_block = weight.shape 35 | assert ((channel_per_group_chunk * groups) == in_channel_chunk) 36 | assert _in_channel_block == in_channel_block 37 | assert in_channel_block == out_channel_block 38 | out_channel_per_group = out_channel_chunk // groups 39 | assert ((out_channel_per_group * groups) == out_channel_chunk) 40 | 41 | stride = (stride, stride) if isinstance(stride, int) else stride 42 | padding = (padding, padding) if isinstance(padding, 
int) else padding 43 | dilation = (dilation, dilation) if isinstance(dilation, int) else dilation 44 | assert (isinstance(stride, tuple) and len(stride) == 2) 45 | assert (isinstance(padding, tuple) and len(padding) == 2) 46 | assert (isinstance(dilation, tuple) and len(dilation) == 2) 47 | 48 | out_h = (in_h + 2 * padding[0] - dilation[0] * (k_h - 1) - 1) // stride[0] + 1 49 | out_w = (in_w + 2 * padding[1] - dilation[1] * (k_w - 1) - 1) // stride[1] + 1 50 | 51 | output = np.zeros((batch_size, out_channel_chunk, out_h, out_w, out_channel_block), dtype=inputs.dtype) 52 | for b in range(batch_size): 53 | for c_c in range(out_channel_chunk): 54 | for h in range(out_h): 55 | for w in range(out_w): 56 | for c_b in range(out_channel_block): 57 | for rc_chunk in range(channel_per_group_chunk): 58 | for rc_block in range(in_channel_block): 59 | for rh in range(k_h): 60 | for rw in range(k_w): 61 | h_index = h * stride[0] + rh * dilation[0] - padding[0] 62 | w_index = w * stride[1] + rw * dilation[1] - padding[1] 63 | if 0 <= h_index < in_h and 0 <= w_index < in_w: 64 | output[b, c_c, h, w, c_b] += inputs[b, 65 | c_c // out_channel_per_group * channel_per_group_chunk + rc_chunk, 66 | h_index, 67 | w_index, 68 | rc_block] * weight[c_c, rc_chunk, rh, rw, rc_block, c_b] 69 | 70 | if bias is not None: 71 | for b in range(batch_size): 72 | for c_c in range(out_channel_chunk): 73 | for h in range(out_h): 74 | for w in range(out_w): 75 | for c_b in range(out_channel_block): 76 | output[b, c_c, h, w, c_b] += bias[c_c, c_b] 77 | 78 | return output -------------------------------------------------------------------------------- /flextensor/test/test_scheduler.py: -------------------------------------------------------------------------------- 1 | import time 2 | import tvm 3 | from flextensor.utils import Config 4 | from flextensor.task import Task 5 | from flextensor.scheduler import schedule, schedule_with_config 6 | from flextensor.measure import _evaluate 7 | 8 | 9 | def test(): 10 | # create an empty task but has the correct key we want 11 | task = Task("yolo1", None, (1, 3, 448, 448, 64, 7, 2, 3, 1, 1), "llvm", 0) 12 | beg = time.time() 13 | # s, bufs, configs = schedule(task.key) 14 | end = time.time() 15 | # print(tvm.lower(s, bufs, simple_mode=True)) 16 | # print("######################################") 17 | # print("op schedules:") 18 | # for config in configs.op_config_lst: 19 | # print("----------------------------------") 20 | # for name, value in config.items(): 21 | # if value: 22 | # print(name, value) 23 | # print("graph schedules:") 24 | # for name, value in configs.graph_config.items(): 25 | # if value: 26 | # print(name, value) 27 | op_configs = [ 28 | { 29 | "spatial": [[1, 1, 1, 1], [1, 1, 1, 3], [454, 1, 1, 1], [1, 227, 2, 1]], 30 | "unroll": [[1500, 1]] 31 | }, 32 | { 33 | "spatial": [[1, 1, 1, 1], [2, 4, 2, 4], [8, 1, 4, 7], [7, 1, 16, 2]], 34 | "reduce": [[1, 3, 1], [7, 1, 1], [7, 1, 1]], 35 | "unroll": [[1500, 1]] 36 | } 37 | ] 38 | graph_config = {"inline": [[0, 0]]} 39 | configs = Config(op_configs, graph_config) 40 | 41 | s, bufs = schedule_with_config(task.key, configs) 42 | time_cost = _evaluate(s, bufs, "llvm", 0, 10) 43 | print("Use", time_cost, "ms") 44 | print("Cost", end - beg, "s") 45 | 46 | 47 | if __name__ == "__main__": 48 | test() -------------------------------------------------------------------------------- /flextensor/test/test_tvm/grad/dqn_pytorch.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 
import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | def _calc_fc1_in_features(input_sz, channels_before_flatten): 9 | def _calc_conv_osize(sz, k, s, pad): 10 | return math.floor((sz + 2*pad - k) / s) + 1 11 | isz = input_sz 12 | isz = _calc_conv_osize(isz, 8, 4, 0) 13 | isz = _calc_conv_osize(isz, 4, 2, 0) 14 | isz = _calc_conv_osize(isz, 3, 1, 0) 15 | return isz * isz * channels_before_flatten 16 | 17 | 18 | class DQN(nn.Module): 19 | def __init__(self, num_actions=18, image_shape=(4, 84, 84)): 20 | super().__init__() 21 | 22 | self.num_actions = num_actions 23 | self.image_shape = image_shape 24 | input_c, input_h, input_w = image_shape 25 | assert input_h == input_w, "input image must be square" 26 | fc1_in_features = _calc_fc1_in_features(input_h, 64) 27 | 28 | self.conv1 = nn.Conv2d(input_c, 32, kernel_size=8, stride=4, padding=0, bias=True) 29 | self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0, bias=True) 30 | self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0, bias=True) 31 | self.fc1 = nn.Linear(fc1_in_features, 512, bias=True) 32 | self.fc2 = nn.Linear(512, num_actions, bias=True) 33 | 34 | def forward(self, x): 35 | x = F.relu(self.conv1(x)) 36 | x = F.relu(self.conv2(x)) 37 | x = F.relu(self.conv3(x)) 38 | x = torch.flatten(x) 39 | x = F.relu(self.fc1(x)) 40 | x = self.fc2(x) 41 | return x 42 | 43 | 44 | model = DQN() 45 | dummy_data = torch.randn(1, *model.image_shape) 46 | model(dummy_data) 47 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm/grad/relay-dqn.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | from tvm import relay 5 | 6 | import tvm.relay.testing 7 | import tvm.contrib.graph_runtime as runtime 8 | 9 | from dqn_pytorch import DQN 10 | 11 | batch_size = 1 12 | 13 | num_actions = 18 14 | 15 | image_shape = (4, 84, 84) 16 | 17 | target = "llvm" 18 | 19 | dtype = "float32" 20 | 21 | input_shape = (batch_size, *image_shape) 22 | 23 | input_type = relay.TensorType(input_shape, dtype) 24 | 25 | print("Get DQN network...") 26 | net = relay.testing.dqn.get_net( 27 | batch_size, num_actions=num_actions, image_shape=image_shape, dtype=dtype) 28 | 29 | fmod, fparams = relay.testing.dqn.get_workload(batch_size) 30 | 31 | print("Get gradient...") 32 | bnet = relay.transform.gradient(fmod["main"], mode='first_order') # default: higher_order 33 | 34 | print("Make workload") 35 | mod, params = relay.testing.create_workload(bnet) # print(mod.get_global_vars()) # [GlobalVar(main)] 36 | 37 | pytorch_model = DQN(num_actions=num_actions, image_shape=image_shape) 38 | param_name_mapping = { 39 | 'conv1.weight': 'conv1_weight', 40 | 'conv1.bias': 'conv1_bias', 41 | 'conv2.weight': 'conv2_weight', 42 | 'conv2.bias': 'conv2_bias', 43 | 'conv3.weight': 'conv3_weight', 44 | 'conv3.bias': 'conv3_bias', 45 | 'fc1.weight': 'dense1_weight', 46 | 'fc1.bias': 'dense1_bias', 47 | 'fc2.weight': 'dense2_weight', 48 | 'fc2.bias': 'dense2_bias', 49 | } 50 | pytorch_model.load_state_dict({ 51 | pth_key: torch.from_numpy(params[tvm_key].asnumpy()) 52 | for pth_key, tvm_key in param_name_mapping.items() 53 | }, strict=True) 54 | pytorch_model.train() 55 | 56 | print("Build graph...") 57 | with relay.build_config(opt_level=3): 58 | graph, lib, params = relay.build_module.build( 59 | mod, target=target, params=params) 60 | 61 | ctx = tvm.device(str(target), 0) 62 | 63 | print("Create 
runtime...") 64 | module = runtime.create(graph, lib, ctx) 65 | 66 | print("Set inputs...") 67 | data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) 68 | module.set_input('data', data_tvm) 69 | module.set_input(**params) 70 | 71 | module.run() 72 | 73 | print(f'#outputs: {module.get_num_outputs()}') 74 | 75 | relay_output = module.get_output(0).asnumpy() 76 | pytorch_output = pytorch_model(torch.from_numpy(data_tvm.asnumpy())) 77 | pytorch_output_np = pytorch_output.data.numpy() 78 | pytorch_output.sum().backward() 79 | 80 | for output_idx in range(1, 12): 81 | output = module.get_output(output_idx) 82 | print(f'Shape: {output.shape}', end=' ') 83 | print(f'Mean: {output.asnumpy().mean()}') 84 | 85 | for name, param in pytorch_model.named_parameters(): 86 | print(f'{name}: {param.grad.mean().item()}') 87 | 88 | print(f'Allclose: {np.allclose(relay_output, pytorch_output_np)}') -------------------------------------------------------------------------------- /flextensor/test/test_tvm/graph/placeholder-only.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | """ 4 | We can't build a single placeholder in TVM, 5 | which will returns Segmentation Fault 6 | """ 7 | 8 | A = tvm.te.placeholder([4, 4], dtype="float32", name="A") 9 | 10 | s = tvm.te.create_schedule(A.op) 11 | 12 | func = tvm.build(s, [A], "llvm") 13 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm/graph/share-placeholder.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | 4 | 5 | dtype = "float32" 6 | 7 | A = tvm.te.placeholder([4, 4], dtype=dtype, name="A") 8 | 9 | B = tvm.te.compute([4, 4], lambda i, j: A[i, j] + 1, name="B") 10 | 11 | C = tvm.te.compute([4, 4], lambda i, j: A[i, j] * 2, name="C") 12 | 13 | target = "llvm" 14 | 15 | s1 = tvm.te.create_schedule(B.op) 16 | 17 | s2 = tvm.te.create_schedule(C.op) 18 | 19 | s3 = tvm.te.create_schedule([B.op, C.op]) 20 | 21 | func1 = tvm.build(s1, [A, B], target=target) 22 | 23 | func2 = tvm.build(s2, [A, C], target=target) 24 | 25 | func3 = tvm.build(s3, [A, B, C], target=target) 26 | 27 | ctx = tvm.device(target) 28 | 29 | A_np = np.random.uniform(-1, 1, [4, 4]).astype(dtype) 30 | B_np = np.zeros([4, 4]).astype(dtype) 31 | C_np = np.zeros([4, 4]).astype(dtype) 32 | 33 | print("Inputs:") 34 | print(A_np) 35 | 36 | 37 | def run(func, id): 38 | A_tvm = tvm.nd.array(A_np, ctx) 39 | B_tvm = tvm.nd.array(B_np, ctx) 40 | C_tvm = tvm.nd.array(C_np, ctx) 41 | if id == 0: 42 | func(A_tvm, B_tvm) 43 | print("Outputs:") 44 | print(B_tvm.asnumpy()) 45 | elif id == 1: 46 | func(A_tvm, C_tvm) 47 | print("Outputs:") 48 | print(C_tvm.asnumpy()) 49 | elif id == 2: 50 | func(A_tvm, B_tvm, C_tvm) 51 | print("Outputs 1:") 52 | print(B_tvm.asnumpy()) 53 | print("Outputs 2:") 54 | print(C_tvm.asnumpy()) 55 | 56 | run(func1, 0) 57 | run(func2, 1) 58 | run(func3, 2) 59 | 60 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm/legacy/multi_compute_inline.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | A = tvm.te.placeholder((32, 32, 32, 32), dtype="float32", name="A") 4 | B = tvm.te.compute((32, 30, 32, 32), lambda i, j, p, q: (A[i, j, p, q] + A[i, j+1, p, q] + A[i, j+2, p, q]) / 3, name="B") 5 | C = tvm.te.compute((30, 30, 32, 32), lambda a, b, c, d: (B[a, b, c, d] + B[a + 1, b, c, 
d] + B[a + 2, b, c, d]), name="C") 6 | D = tvm.te.compute((30, 30, 32, 32), lambda h, k, l, m: (C[h, k, l, m] * 2), name="D") 7 | E = tvm.te.compute((30, 30, 32, 32), lambda h, k, l, m: (C[h, k, l, m] * 3), name="E") 8 | F = tvm.te.compute((30, 30, 32, 32), lambda h, k, l, m: (D[h, k, l, m] + E[h, k, l, m]), name="F") 9 | 10 | s = tvm.te.create_schedule(F.op) 11 | s[C].compute_inline() 12 | print(str(tvm.lower(s, [A, F], simple_mode=True))) 13 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm/legacy/test_compute_inline.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | 4 | def compute_inline_reduce(): 5 | A = tvm.te.placeholder((32, 32, 32, 32), dtype="float32", name="A") 6 | B = tvm.te.placeholder((32, 32), dtype="float32", name="B") 7 | k = tvm.te.reduce_axis((0, 32), name="k") 8 | C = tvm.te.compute((30, 30, 32, 32), lambda a, b, c, d: tvm.te.sum(A[a, b, c, k] * B[k, d], axis=k), name="C") 9 | D = tvm.te.compute((30, 30, 32, 32), lambda h, k, l, m: (C[h, k, l, m] * 2), name="D") 10 | E = tvm.te.compute((30, 30, 32, 32), lambda h, k, l, m: (C[h, k, l, m] * 3), name="E") 11 | F = tvm.te.compute((30, 30, 32, 32), lambda h, k, l, m: (D[h, k, l, m] + E[h, k, l, m]), name="F") 12 | 13 | s = tvm.te.create_schedule(F.op) 14 | s[C].compute_inline() 15 | try: 16 | tvm.build(s, [A, F], "llvm") 17 | except Exception as e: 18 | return False, str(e) 19 | return True, "pass" 20 | 21 | 22 | def compute_inline_output(): 23 | N = 1024 24 | M = 512 25 | A = tvm.te.placeholder((M, N), name="A") 26 | B, C = tvm.te.compute((M, N), lambda i, j: (A[i, j] + 1, A[i, j] * 2), name="B_C") 27 | D = tvm.te.compute((M, N), lambda i, j: B[i, j] * 2, name="D") 28 | 29 | s = tvm.te.create_schedule(D.op) 30 | s[B].compute_inline() 31 | try: 32 | tvm.build(s, [A, C, D], "llvm") 33 | except Exception as e: 34 | return False, str(e) 35 | return True, "pass" 36 | 37 | 38 | if __name__ == "__main__": 39 | print("compute_inline_reduce:", *compute_inline_reduce()) 40 | print("compute_inline_output:", *compute_inline_output()) 41 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm/legacy/test_conv2d_hwcn_map.py: -------------------------------------------------------------------------------- 1 | """Example code to do convolution.""" 2 | import os 3 | import numpy as np 4 | import scipy.signal 5 | import tvm 6 | from tvm.contrib import nvcc 7 | import topi 8 | from topi.util import get_const_tuple 9 | 10 | TASK = "conv2d_hwcn_map" 11 | USE_MANUAL_CODE = False 12 | 13 | @tvm.register_func 14 | def tvm_callback_cuda_compile(code): 15 | ptx = nvcc.compile_cuda(code, target="ptx") 16 | return ptx 17 | 18 | def write_code(code, fname): 19 | with open(fname, "w") as f: 20 | f.write(code) 21 | 22 | @tvm.register_func 23 | def tvm_callback_cuda_postproc(code): 24 | if not os.path.exists("perf"): 25 | os.mkdir("perf") 26 | write_code(code, "perf/%s_generated.cu" % TASK) 27 | if USE_MANUAL_CODE: 28 | code = open("perf/%s_manual.cu" % TASK).read() 29 | return code 30 | 31 | 32 | def test_conv2d_hwcn_map(): 33 | batch = 64 34 | in_channel = 128 35 | in_height = 16 36 | in_width = 16 37 | num_filter = 128 38 | kernel = 3 39 | stride = 2 40 | padding = 'SAME' 41 | 42 | A = tvm.te.placeholder((in_height, in_width, in_channel, batch), name='A') 43 | W = tvm.te.placeholder((kernel, kernel, in_channel, num_filter), name='W') 44 | B = topi.nn.conv2d_hwcn(A, W, stride, padding) 
45 | C = topi.nn.relu(B) 46 | s1 = topi.cuda.schedule_conv2d_hwcn([B]) 47 | s2 = topi.cuda.schedule_conv2d_hwcn([C]) 48 | 49 | a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) 50 | w_np = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype) 51 | b_np = topi.testing.conv2d_hwcn_python(a_np, w_np, stride, padding) 52 | c_np = np.maximum(b_np, 0) 53 | 54 | def check_device(device): 55 | if not tvm.runtime.module.enabled(device): 56 | print("Skip because %s is not enabled" % device) 57 | return 58 | ctx = tvm.device(device, 0) 59 | a = tvm.nd.array(a_np, ctx) 60 | w = tvm.nd.array(w_np, ctx) 61 | b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) 62 | c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) 63 | with tvm.build_config(auto_unroll_max_step=128, 64 | unroll_explicit=device == 'rocm'): 65 | func1 = tvm.build(s1, [A, W, B], device) 66 | func1(a, w, b) 67 | tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) 68 | func2 = tvm.build(s2, [A, W, C], device) 69 | func2(a, w, c) 70 | tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) 71 | 72 | for device in ['cuda', 'opencl', 'rocm']: 73 | check_device(device) 74 | 75 | 76 | if __name__ == "__main__": 77 | test_conv2d_hwcn_map() 78 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm/legacy/test_multi_outputs.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | 4 | A = tvm.te.placeholder((10, 10), name="A") 5 | B0, B1 = tvm.te.compute((10, 10), lambda i, j: (A[i, j] + 1, A[i, j] * 2), name="B") 6 | s = tvm.te.create_schedule(B1.op) 7 | cache0 = s.cache_write(B0.op.output(0), "local") 8 | cache1 = s.cache_write(B1.op.output(1), "local") 9 | print(tvm.lower(s, [A, B0, B1], simple_mode=True)) 10 | func = tvm.build(s, [A, B0, B1], "llvm") -------------------------------------------------------------------------------- /flextensor/test/test_tvm/legacy/test_one_thread.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/test/test_tvm/legacy/test_one_thread.py -------------------------------------------------------------------------------- /flextensor/test/test_tvm/legacy/test_reduce_map.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tvm 3 | from tvm.contrib import nvcc 4 | import numpy as np 5 | 6 | import topi 7 | 8 | 9 | TASK = "reduce_map" 10 | USE_MANUAL_CODE = False 11 | 12 | 13 | @tvm.register_func 14 | def tvm_callback_cuda_compile(code): 15 | ptx = nvcc.compile_cuda(code, target="ptx") 16 | return ptx 17 | 18 | 19 | def write_code(code, fname): 20 | with open(fname, "w") as f: 21 | f.write(code) 22 | 23 | 24 | @tvm.register_func 25 | def tvm_callback_cuda_postproc(code): 26 | if not os.path.exists("perf"): 27 | os.mkdir("perf") 28 | write_code(code, "perf/%s_generated.cu" % TASK) 29 | if USE_MANUAL_CODE: 30 | code = open("perf/%s_manual.cu" % TASK).read() 31 | return code 32 | 33 | 34 | def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0): 35 | global TASK 36 | # Build the logic and compile the function 37 | A = tvm.te.placeholder(shape=in_shape, name="A") 38 | if type == "sum": 39 | TASK = "sum_map_id%d" %test_id 40 | B = topi.sum(A, axis=axis, keepdims=keepdims) 41 | elif type == "max": 42 | TASK = "max_map_id%d" %test_id 43 | 
B = topi.max(A, axis=axis, keepdims=keepdims) 44 | elif type == "min": 45 | TASK = "min_map_id%d" %test_id 46 | B = topi.min(A, axis=axis, keepdims=keepdims) 47 | else: 48 | raise NotImplementedError 49 | s = topi.cuda.schedule_reduce(B) 50 | with tvm.build_config(auto_unroll_max_step=16, 51 | auto_unroll_min_depth=0): 52 | fcuda = tvm.build(s, [A, B], "cuda", name="sum") 53 | 54 | # Test 55 | in_npy = np.random.normal(size=in_shape).astype(np.float32) 56 | if type == "sum": 57 | out_npy = in_npy.sum(axis=axis, keepdims=keepdims) 58 | elif type == "max": 59 | out_npy = in_npy.max(axis=axis, keepdims=keepdims) 60 | elif type == "min": 61 | out_npy = in_npy.min(axis=axis, keepdims=keepdims) 62 | else: 63 | raise NotImplementedError 64 | 65 | data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu()) 66 | out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu()) 67 | 68 | for _ in range(2): 69 | fcuda(data_tvm, out_tvm) 70 | tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, rtol=4e-4, atol=4e-4) 71 | 72 | if __name__ == "__main__": 73 | test_reduce_map(in_shape=(128, 24, 128, 24), 74 | axis=(1, 2, 3), 75 | keepdims=True, 76 | type="sum", 77 | test_id=0) 78 | test_reduce_map(in_shape=(128, 24 * 128 * 24), 79 | axis=(1,), 80 | keepdims=False, 81 | type="max", 82 | test_id=1) 83 | test_reduce_map(in_shape=(32, 128, 24), 84 | axis=None, 85 | keepdims=True, 86 | type="sum", 87 | test_id=2) 88 | test_reduce_map(in_shape=(128, 24, 128, 24), 89 | axis=(0, 2), 90 | keepdims=False, 91 | type="min", 92 | test_id=3) 93 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/requires_grad.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | A = tvm.te.placeholder([4, 4]) 4 | 5 | print("A requires_grad=", A.requires_grad) 6 | 7 | B = tvm.te.placeholder([4, 4], requires_grad=True) 8 | 9 | print("B requires_grad=", B.requires_grad) 10 | 11 | C = tvm.te.compute([4, 4], lambda i, j: A[i, j]) 12 | 13 | print("C requires_grad=", C.requires_grad) 14 | 15 | D = tvm.te.compute([4, 4], lambda i, j: A[i, j], requires_grad=True) 16 | 17 | print("D requires_grad=", D.requires_grad) 18 | 19 | E = tvm.te.compute([4, 4], lambda i, j: B[i, j]) 20 | 21 | print("E requires_grad=", E.requires_grad) 22 | 23 | F = tvm.te.compute([4, 4], lambda i, j: B[i, j], requires_grad=True) 24 | 25 | print("F requires_grad=", F.requires_grad) -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-avgpool-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | 4 | 5 | H = 6 6 | W = 6 7 | 8 | R = 2 9 | S = 2 10 | 11 | P = H // R 12 | Q = W // S 13 | 14 | dtype = "float32" 15 | 16 | A = tvm.te.placeholder([H, W], dtype=dtype, name="A") 17 | 18 | r = tvm.te.reduce_axis([0, R], name="r") 19 | s = tvm.te.reduce_axis([0, S], name="s") 20 | 21 | C = tvm.te.compute([P, Q], 22 | lambda i, j: tvm.te.sum(A[i * R + r, j * S + s]/(R*S), axis=[r, s]), name="C") 23 | 24 | dC = tvm.te.placeholder([P, Q], dtype=dtype, name="dC") 25 | 26 | 27 | dA = tvm.te.grad_op(A, C, dC) 28 | 29 | s = tvm.te.create_schedule(dA.op) 30 | 31 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 32 | 33 | func = tvm.build(s, [A, dC, dA], target="llvm") 34 | 35 | A_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 36 | dC_np = np.random.uniform(-10, 10, [P, Q]).astype("float32") 37 | dA_np = np.zeros([H, 
W]).astype("float32") 38 | 39 | ctx = tvm.device("llvm", 0) 40 | A_tvm = tvm.nd.array(A_np, ctx) 41 | dC_tvm = tvm.nd.array(dC_np, ctx) 42 | dA_tvm = tvm.nd.array(dA_np, ctx) 43 | 44 | func(A_tvm, dC_tvm, dA_tvm) 45 | 46 | print(dC_tvm) 47 | print(dA_tvm.asnumpy()) 48 | 49 | # =======> 50 | # compare the results with numpy 51 | golden_np = np.zeros([H, W]).astype("float32") 52 | for i in range(0, P): 53 | for j in range(0, Q): 54 | for di in range(0, R): 55 | for dj in range(0, S): 56 | assert(i+di < H) 57 | assert(j+dj < W) 58 | golden_np[i*R+di][j*S+dj] = dC_np[i][j] / (R * S) 59 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-30) 60 | print("Compare with Numpy success!") 61 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-broadcast-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | 4 | 5 | H = 8 6 | W = 16 7 | 8 | dtype = "float32" 9 | 10 | A = tvm.te.placeholder([H], dtype=dtype, name="A") 11 | C = tvm.te.compute([H, W], 12 | lambda i, j: 13 | A[i], name="C") 14 | 15 | dC = tvm.te.placeholder([H, W], dtype=dtype, name="dC") 16 | 17 | dA = tvm.te.grad_op(A, C, dC) 18 | 19 | s = tvm.te.create_schedule(dA.op) 20 | 21 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 22 | 23 | func = tvm.build(s, [A, dC, dA], target="llvm") 24 | 25 | A_np = np.random.uniform(-10, 10, [H]).astype("float32") 26 | dC_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 27 | dA_np = np.zeros([H]).astype("float32") 28 | 29 | ctx = tvm.device("llvm", 0) 30 | A_tvm = tvm.nd.array(A_np, ctx) 31 | dC_tvm = tvm.nd.array(dC_np, ctx) 32 | dA_tvm = tvm.nd.array(dA_np, ctx) 33 | 34 | func(A_tvm, dC_tvm, dA_tvm) 35 | 36 | print(dA_tvm) 37 | 38 | # =======> 39 | # compare the results with numpy 40 | golden_np = np.sum(dC_np, axis=1) 41 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-6) 42 | print("Compare with Numpy success!") 43 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-cat-case1.py: -------------------------------------------------------------------------------- 1 | import topi 2 | import tvm 3 | import numpy as np 4 | import torch 5 | 6 | 7 | dim0 = 8 8 | dim1 = 3 9 | dim2 = 4 10 | shape_size1 = [dim0, dim1] 11 | shape_size2 = [dim0, dim2] 12 | dtype = "float32" 13 | 14 | A = tvm.te.placeholder(shape_size1, dtype=dtype, name="A") 15 | B = tvm.te.placeholder(shape_size2, dtype=dtype, name="B") 16 | C = topi.concatenate([A, B], axis=1) 17 | 18 | dC = tvm.te.placeholder(C.shape, dtype=dtype, name="dC") 19 | dA, dB = tvm.te.mygradient(C, [A, B], dC) 20 | 21 | s = tvm.te.create_schedule([C.op, dA.op, dB.op]) 22 | 23 | print(tvm.lower(s, [A, B, dC, dA, dB], simple_mode=True)) 24 | 25 | func = tvm.build(s, [A, B, dC, dA, dB], target="llvm") 26 | 27 | A_np = np.random.uniform(-10, 10, shape_size1).astype("float32") 28 | B_np = np.random.uniform(-10, 10, shape_size2).astype("float32") 29 | 30 | dC_np = np.ones([dim0, dim1+dim2]).astype("float32") 31 | dA_np = np.zeros(shape_size1).astype("float32") 32 | dB_np = np.zeros(shape_size2).astype("float32") 33 | 34 | ctx = tvm.device("llvm", 0) 35 | A_tvm = tvm.nd.array(A_np, ctx) 36 | B_tvm = tvm.nd.array(B_np, ctx) 37 | 38 | dC_tvm = tvm.nd.array(dC_np, ctx) 39 | dA_tvm = tvm.nd.array(dA_np, ctx) 40 | dB_tvm = tvm.nd.array(dB_np, ctx) 41 | func(A_tvm, B_tvm, dC_tvm, dA_tvm, dB_tvm) 42 | 43 | print("dA_tvm", 
dA_tvm) 44 | 45 | # =======> 46 | # compare the results with pytorch 47 | A_torch = torch.tensor(A_np, requires_grad=True) 48 | B_torch = torch.tensor(B_np, requires_grad=True) 49 | C_torch = torch.cat([A_torch, B_torch], dim=1) 50 | loss = C_torch.sum() 51 | loss.backward() 52 | print("Pytorch gradient:\n", A_torch.grad.numpy(), B_torch.grad.numpy()) 53 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A_torch.grad.numpy(), atol=1e-30, rtol=1e-30) 54 | tvm.testing.assert_allclose(dB_tvm.asnumpy(), B_torch.grad.numpy(), atol=1e-30, rtol=1e-30) 55 | print("Compare with PyTorch success!") 56 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-conv2d-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | N = 2 7 | nC = 16 8 | H = 14 9 | W = 14 10 | K = 8 11 | R = 3 12 | S = 3 13 | 14 | st = 1 15 | 16 | P = (H - R + 1) // st 17 | Q = (W - S + 1) // st 18 | 19 | dtype = "float32" 20 | 21 | A = tvm.te.placeholder([N, nC, H, W], dtype=dtype, name="A") 22 | B = tvm.te.placeholder([K, nC, R, S], dtype=dtype, name="B") 23 | c = tvm.te.reduce_axis([0, nC], name="c") 24 | r = tvm.te.reduce_axis([0, R], name="r") 25 | s = tvm.te.reduce_axis([0, S], name="s") 26 | C = tvm.te.compute([N, K, P, Q], 27 | lambda n, k, h, w : 28 | tvm.te.sum(A[n, c, h * st + r, w * st + s] * B[k, c, r, s], axis=[c,r,s]), name="C") 29 | 30 | dC = tvm.te.placeholder([N, K, P, Q], dtype=dtype, name="dC") 31 | 32 | print(C.op.body) 33 | 34 | print(dir(C.op.body[0].source[0])) 35 | 36 | print(tvm.te.expr_equal(C.op.body[0].source[0].b.args[0], C.op.body[0].source[0].b.args[1])) 37 | 38 | dA = tvm.te.grad_op(A, C, dC) 39 | 40 | s = tvm.te.create_schedule(dA.op) 41 | 42 | print(tvm.lower(s, [A, B, dC, dA], simple_mode=True)) 43 | 44 | func = tvm.build(s, [A, B, dC, dA], target="llvm") 45 | 46 | A_np = np.random.uniform(-10, 10, [N, nC, H, W]).astype("float32") 47 | B_np = np.random.uniform(-10, 10, [K, nC, R, S]).astype("float32") 48 | dC_np = np.random.uniform(-10, 10, [N, K, P, Q]).astype("float32") 49 | dA_np = np.zeros([N, nC, H, W]).astype("float32") 50 | 51 | ctx = tvm.device("llvm", 0) 52 | A_tvm = tvm.nd.array(A_np, ctx) 53 | B_tvm = tvm.nd.array(B_np, ctx) 54 | dC_tvm = tvm.nd.array(dC_np, ctx) 55 | dA_tvm = tvm.nd.array(dA_np, ctx) 56 | 57 | func(A_tvm, B_tvm, dC_tvm, dA_tvm) 58 | 59 | print(dA_tvm) 60 | 61 | # =======> 62 | # compare the results with pytorch 63 | A_torch = torch.tensor(A_np) 64 | B_torch = torch.tensor(B_np) 65 | dC_torch = torch.tensor(dC_np) 66 | golden_torch = torch.nn.functional.conv_transpose2d(dC_torch, B_torch) 67 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_torch.numpy(), rtol=1e-3) 68 | print("Compare with PyTorch success!") 69 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-conv2d-case2.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | N = 3 7 | nC = 1024 8 | H = 15 9 | W = 15 10 | K = 1024 11 | R = 3 12 | S = 3 13 | 14 | st = 2 15 | group = 2 16 | 17 | OG = K // group 18 | IG = nC // group 19 | 20 | P = (H - R + 1) // st + 1 21 | Q = (W - S + 1) // st + 1 22 | 23 | dtype = "float32" 24 | 25 | A = tvm.te.placeholder([N, nC, H, W], dtype=dtype, name="A") 26 | B = tvm.te.placeholder([K, nC, R, S], dtype=dtype, name="B") 27 | c = 
tvm.te.reduce_axis([0, nC], name="c") 28 | r = tvm.te.reduce_axis([0, R], name="r") 29 | s = tvm.te.reduce_axis([0, S], name="s") 30 | C = tvm.te.compute([N, K, P, Q], 31 | lambda n, k, h, w : 32 | tvm.te.sum(A[n, c, h * st + r, w * st + s] * B[k, c, r, s], axis=[c,r,s]), name="C") 33 | 34 | dC = tvm.te.placeholder([N, K, P, Q], dtype=dtype, name="dC") 35 | 36 | print(C.op.body) 37 | 38 | print(dir(C.op.body[0].source[0])) 39 | 40 | print(tvm.te.expr_equal(C.op.body[0].source[0].b.args[0], C.op.body[0].source[0].b.args[1])) 41 | 42 | dA = tvm.te.grad_op(A, C, dC) 43 | 44 | s = tvm.te.create_schedule(dA.op) 45 | 46 | print(tvm.lower(s, [A, B, dC, dA], simple_mode=True)) 47 | 48 | func = tvm.build(s, [A, B, dC, dA], target="llvm") 49 | 50 | A_np = np.random.uniform(-1, 1, [N, nC, H, W]).astype("float32") 51 | # B_np = np.ones([K, nC, R, S]).astype("float32") 52 | B_np = np.random.uniform(-1, 1, [K, nC, R, S]).astype("float32") 53 | # dC_np = np.ones([N, K, P, Q]).astype("float32") 54 | dC_np = np.random.uniform(-1, 1, [N, K, P, Q]).astype("float32") 55 | 56 | dA_np = np.zeros([N, nC, H, W]).astype("float32") 57 | 58 | ctx = tvm.device("llvm", 0) 59 | A_tvm = tvm.nd.array(A_np, ctx) 60 | B_tvm = tvm.nd.array(B_np, ctx) 61 | dC_tvm = tvm.nd.array(dC_np, ctx) 62 | dA_tvm = tvm.nd.array(dA_np, ctx) 63 | 64 | func(A_tvm, B_tvm, dC_tvm, dA_tvm) 65 | 66 | 67 | # compare the results with pytorch 68 | A_torch = torch.tensor(A_np) 69 | B_torch = torch.tensor(B_np) 70 | dC_torch = torch.tensor(dC_np) 71 | #without output_padding=1: shapes (2, 16, 14, 14), golden:(2, 16, 13, 13) mismatch 72 | golden_torch = torch.nn.functional.conv_transpose2d(dC_torch, B_torch, stride=(st, st), output_padding=0) 73 | # print("da_tvm", dA_tvm.shape) 74 | # print("golden_shape,", golden_torch.size()) 75 | 76 | # print("dA_tvm:", dA_tvm) 77 | # print("golden_torch", golden_torch) 78 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_torch.numpy(), atol=1e-3, rtol=1e-5) 79 | print("Success!\n") 80 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-conv2d-case3.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | N = 2 7 | nC = 16 8 | H = 14 9 | W = 14 10 | K = 8 11 | R = 3 12 | S = 3 13 | 14 | st = 2 15 | group = 2 16 | 17 | OG = K // group 18 | IG = nC // group 19 | 20 | P = (H - R + 1) // st 21 | Q = (W - S + 1) // st 22 | 23 | dtype = "float32" 24 | 25 | A = tvm.te.placeholder([N, nC, H, W], dtype=dtype, name="A") 26 | B = tvm.te.placeholder([K, IG, R, S], dtype=dtype, name="B") 27 | c = tvm.te.reduce_axis([0, IG], name="c") 28 | r = tvm.te.reduce_axis([0, R], name="r") 29 | s = tvm.te.reduce_axis([0, S], name="s") 30 | C = tvm.te.compute([N, K, P, Q], 31 | lambda n, k, h, w : 32 | tvm.te.sum(A[n, k // OG * IG + c, h * st + r, w * st + s] * B[k, c, r, s], axis=[c,r,s]), name="C") 33 | 34 | dC = tvm.te.placeholder([N, K, P, Q], dtype=dtype, name="dC") 35 | 36 | print(C.op.body) 37 | 38 | print(dir(C.op.body[0].source[0])) 39 | 40 | print(tvm.te.expr_equal(C.op.body[0].source[0].b.args[0], C.op.body[0].source[0].b.args[1])) 41 | 42 | dA = tvm.te.grad_op(A, C, dC) 43 | 44 | s = tvm.te.create_schedule(dA.op) 45 | 46 | print(tvm.lower(s, [A, B, dC, dA], simple_mode=True)) 47 | 48 | func = tvm.build(s, [A, B, dC, dA], target="llvm") 49 | 50 | A_np = np.random.uniform(-1, 1, [N, nC, H, W]).astype("float32") 51 | B_np = np.random.uniform(-1, 1, [K, IG, R, 
S]).astype("float32") 52 | dC_np = np.random.uniform(-1, 1, [N, K, P, Q]).astype("float32") 53 | dA_np = np.zeros([N, nC, H, W]).astype("float32") 54 | 55 | ctx = tvm.device("llvm", 0) 56 | A_tvm = tvm.nd.array(A_np, ctx) 57 | B_tvm = tvm.nd.array(B_np, ctx) 58 | dC_tvm = tvm.nd.array(dC_np, ctx) 59 | dA_tvm = tvm.nd.array(dA_np, ctx) 60 | 61 | func(A_tvm, B_tvm, dC_tvm, dA_tvm) 62 | 63 | 64 | # =======> 65 | # compare the results with pytorch 66 | A_torch = torch.tensor(A_np) 67 | B_torch = torch.tensor(B_np) 68 | dC_torch = torch.tensor(dC_np) 69 | #without output_padding=1: shapes (2, 16, 14, 14), golden:(2, 16, 13, 13) mismatch 70 | golden_torch = torch.nn.functional.conv_transpose2d(dC_torch, B_torch, stride=(st, st), output_padding=1, groups=group) 71 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_torch.numpy(), rtol=1e-3) 72 | print("Compare with PyTorch success!") 73 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-conv2d-topi-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | import topi 5 | 6 | 7 | N = 2 8 | nC = 16 9 | H = 14 10 | W = 14 11 | K = 8 12 | R = 3 13 | S = 3 14 | 15 | st = 1 16 | 17 | P = (H - R + 1) // st 18 | Q = (W - S + 1) // st 19 | 20 | dtype = "float32" 21 | 22 | A = tvm.te.placeholder([N, nC, H, W], dtype=dtype, name="A") 23 | B = tvm.te.placeholder([K, nC, R, S], dtype=dtype, name="B") 24 | c = tvm.te.reduce_axis([0, nC], name="c") 25 | r = tvm.te.reduce_axis([0, R], name="r") 26 | s = tvm.te.reduce_axis([0, S], name="s") 27 | C = topi.nn.conv2d_nchw(A, B, 1, 0, 1, out_dtype=dtype) 28 | #C = tvm.te.compute([N, K, P, Q], 29 | # lambda n, k, h, w : 30 | # tvm.te.sum(A[n, c, h * st + r, w * st + s] * B[k, c, r, s], axis=[c,r,s]), name="C") 31 | 32 | dC = tvm.te.placeholder([N, K, P, Q], dtype=dtype, name="dC") 33 | 34 | print(C.op.body) 35 | 36 | print(dir(C.op.body[0].source[0])) 37 | 38 | print(tvm.te.expr_equal(C.op.body[0].source[0].b.args[0], C.op.body[0].source[0].b.args[1])) 39 | 40 | dA, = tvm.te.mygradient(C, [A], dC) 41 | #dA = tvm.te.grad_op(A, C, dC) 42 | 43 | s = tvm.te.create_schedule(dA.op) 44 | 45 | print(tvm.lower(s, [A, B, dC, dA], simple_mode=True)) 46 | 47 | func = tvm.build(s, [A, B, dC, dA], target="llvm") 48 | 49 | A_np = np.random.uniform(-10, 10, [N, nC, H, W]).astype("float32") 50 | B_np = np.random.uniform(-10, 10, [K, nC, R, S]).astype("float32") 51 | dC_np = np.random.uniform(-10, 10, [N, K, P, Q]).astype("float32") 52 | dA_np = np.zeros([N, nC, H, W]).astype("float32") 53 | 54 | ctx = tvm.device("llvm", 0) 55 | A_tvm = tvm.nd.array(A_np, ctx) 56 | B_tvm = tvm.nd.array(B_np, ctx) 57 | dC_tvm = tvm.nd.array(dC_np, ctx) 58 | dA_tvm = tvm.nd.array(dA_np, ctx) 59 | 60 | func(A_tvm, B_tvm, dC_tvm, dA_tvm) 61 | 62 | print(dA_tvm) 63 | 64 | # =======> 65 | # compare the results with pytorch 66 | A_torch = torch.tensor(A_np) 67 | B_torch = torch.tensor(B_np) 68 | dC_torch = torch.tensor(dC_np) 69 | golden_torch = torch.nn.functional.conv_transpose2d(dC_torch, B_torch) 70 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_torch.numpy(), rtol=1e-5, atol=1e-4) 71 | print("Compare with PyTorch success!") 72 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-cross_entropy-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as 
np 3 | import torch 4 | 5 | 6 | def cross_entropy(inputs, targets, weights, reduction="mean"): 7 | N, C = inputs.shape 8 | c = tvm.te.reduce_axis([0, C], "c") 9 | sum_val = tvm.te.compute([N], lambda i: tvm.te.sum(tvm.tir.exp(inputs[i, c]), axis=[c]), "sum_val") 10 | if reduction == "mean": 11 | rn = tvm.te.reduce_axis([0, N], "rn") 12 | rc = tvm.te.reduce_axis([0, C], "rc") 13 | sum_weight = tvm.te.compute([1], lambda i: tvm.te.sum(weights[i+rc]*targets[rn, rc]/N, axis=[rn, rc]), "mean_weight", requires_grad=False) 14 | elif reduction == "sum": 15 | sum_weight = tvm.te.compute([1], lambda i: tvm.tir.expr.const(1, weights.dtype)/N, "sum_weight", requires_grad=False) 16 | else: 17 | raise NameError() 18 | rrn = tvm.te.reduce_axis([0, N], "rrn") 19 | rrc = tvm.te.reduce_axis([0, C], "rrc") 20 | # return tvm.te.compute([1], 21 | # lambda i: tvm.te.sum( 22 | # weights[rrc] * targets[i+rrn, rrc] * (tvm.tir.log(sum_val[i+rrn]) - inputs[i+rrn, rrc]*targets[rrn, rrc])/(N*sum_weight[i]), 23 | # axis=[rrn, rrc]), 24 | # "cross_entropy") 25 | return tvm.te.compute([1], 26 | lambda i: tvm.te.sum( 27 | targets[i+rrn, rrc] * (tvm.tir.log(sum_val[i+rrn]) - inputs[i+rrn, rrc]*targets[i+rrn, rrc])/(N), 28 | axis=[rrn, rrc]), 29 | "cross_entropy") 30 | 31 | 32 | N = 100 33 | C = 100 34 | dtype = "float32" 35 | ltype = "int64" 36 | target = "llvm" 37 | 38 | A = tvm.te.placeholder([N, C], dtype=dtype, name="A") 39 | targets = tvm.te.placeholder([N, C], dtype=dtype, name="targets", requires_grad=False) 40 | labels = tvm.te.placeholder([N], dtype=ltype, name="labels", requires_grad=False) 41 | weights = tvm.te.placeholder([C], dtype=dtype, name="weights", requires_grad=False) 42 | 43 | loss = cross_entropy(A, targets, weights, reduction="mean") 44 | 45 | dloss = tvm.te.placeholder([1], dtype=dtype, name="dloss") 46 | 47 | dA, = tvm.te.mygradient(loss, [A], dloss) 48 | 49 | s = tvm.te.create_schedule([loss.op, dA.op]) 50 | 51 | print(tvm.lower(s, [A, targets, weights, loss, dloss, dA], simple_mode=True)) 52 | 53 | func = tvm.build(s, [A, targets, weights, loss, dloss, dA], target) 54 | 55 | A_np = np.random.uniform(-1, 1, [N, C]).astype(dtype) 56 | dA_np = np.zeros([N, C]).astype(dtype) * 0 + 1 57 | labels_np = np.random.randint(0, C, [N]).astype(ltype) 58 | # labels_np[0] = 1 59 | targets_np = np.zeros([N, C]).astype(dtype) 60 | for i in range(N): 61 | targets_np[i][labels_np[i]] = 1.0 62 | weights_np = np.random.uniform(-1, 1, [C]).astype(dtype) * 0 + 1 63 | loss_np = np.zeros([1]).astype(dtype) 64 | dloss_np = np.random.uniform(-1, 1, [1]).astype(dtype) * 0 + 1 65 | 66 | ctx = tvm.device(target, 0) 67 | A_tvm = tvm.nd.array(A_np, ctx) 68 | dA_tvm = tvm.nd.array(dA_np, ctx) 69 | targets_tvm = tvm.nd.array(targets_np, ctx) 70 | weights_tvm = tvm.nd.array(weights_np, ctx) 71 | loss_tvm = tvm.nd.array(loss_np, ctx) 72 | dloss_tvm = tvm.nd.array(dloss_np, ctx) 73 | 74 | func(A_tvm, targets_tvm, weights_tvm, loss_tvm, dloss_tvm, dA_tvm) 75 | print(loss_tvm) 76 | print(dA_tvm) 77 | 78 | A_torch = torch.tensor(A_np, requires_grad=True) 79 | labels_torch = torch.tensor(labels_np) 80 | weights_torch = torch.tensor(weights_np) 81 | loss_torch = torch.nn.functional.cross_entropy(A_torch, labels_torch, weights_torch, reduction="mean") 82 | 83 | print(loss_torch.detach().numpy()) 84 | loss_torch.backward() 85 | print(A_torch.grad.numpy()) 86 | 87 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A_torch.grad.numpy(), rtol=1e-30, atol=1e-9) 88 | print("Compare to PyTorch success!") 
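89 | 90 | # Extra illustrative check (an addition for clarity; not part of the original test): 91 | # with one-hot targets and unit weights, the analytic gradient of the mean-reduced 92 | # cross entropy above is (softmax(A) - targets) / N, so dA can also be verified 93 | # directly against NumPy. The tolerances below are assumptions. 94 | exp_A = np.exp(A_np - A_np.max(axis=1, keepdims=True)) 95 | softmax_np = exp_A / exp_A.sum(axis=1, keepdims=True) 96 | golden_np = (softmax_np - targets_np) / N 97 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-4, atol=1e-6) 98 | print("Compare to analytic NumPy gradient success!")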
-------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-downcast-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | 4 | 5 | H = 8 6 | 7 | dtype = "float32" 8 | 9 | A = tvm.te.placeholder([H, H], dtype=dtype, name="A") 10 | C = tvm.te.compute([H], 11 | lambda i: 12 | A[i, i], name="C") 13 | 14 | dC = tvm.te.placeholder([H], dtype=dtype, name="dC") 15 | 16 | dA = tvm.te.grad_op(A, C, dC) 17 | 18 | s = tvm.te.create_schedule(dA.op) 19 | 20 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 21 | 22 | func = tvm.build(s, [A, dC, dA], target="llvm") 23 | 24 | A_np = np.random.uniform(-10, 10, [H, H]).astype("float32") 25 | dC_np = np.random.uniform(-10, 10, [H]).astype("float32") 26 | dA_np = np.zeros([H, H]).astype("float32") 27 | 28 | ctx = tvm.device("llvm", 0) 29 | A_tvm = tvm.nd.array(A_np, ctx) 30 | dC_tvm = tvm.nd.array(dC_np, ctx) 31 | dA_tvm = tvm.nd.array(dA_np, ctx) 32 | 33 | func(A_tvm, dC_tvm, dA_tvm) 34 | 35 | print(dA_tvm) 36 | 37 | # =======> 38 | # compare the results with numpy 39 | golden_np = np.diag(dC_np) 40 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-30) 41 | print("Compare with Numpy success!") 42 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-flatten.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | 4 | 5 | H = 8 6 | W = 16 7 | 8 | dtype = "float32" 9 | 10 | A = tvm.te.placeholder([H, W], dtype=dtype, name="A") 11 | C = tvm.te.compute([H * W], 12 | lambda i: 13 | A[i//W, i%W], name="C") 14 | 15 | dC = tvm.te.placeholder([H * W], dtype=dtype, name="dC") 16 | 17 | dA = tvm.te.grad_op(A, C, dC) 18 | 19 | s = tvm.te.create_schedule(dA.op) 20 | 21 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 22 | 23 | func = tvm.build(s, [A, dC, dA], target="llvm") 24 | 25 | A_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 26 | dC_np = np.random.uniform(-10, 10, [H * W]).astype("float32") 27 | dA_np = np.zeros([H, W]).astype("float32") 28 | 29 | ctx = tvm.device("llvm", 0) 30 | A_tvm = tvm.nd.array(A_np, ctx) 31 | dC_tvm = tvm.nd.array(dC_np, ctx) 32 | dA_tvm = tvm.nd.array(dA_np, ctx) 33 | 34 | func(A_tvm, dC_tvm, dA_tvm) 35 | 36 | print(dA_tvm) 37 | 38 | # =======> 39 | # compare the results with numpy 40 | golden_np = np.reshape(dC_np, (H, W)) 41 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-30) 42 | print("Compare with Numpy success!") -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-gemm.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | H = 8 7 | W = 4 8 | K = 3 9 | 10 | dtype = "float32" 11 | 12 | A = tvm.te.placeholder([H, K], dtype=dtype, name="A") 13 | B = tvm.te.placeholder([K, W], dtype=dtype, name="B") 14 | k = tvm.te.reduce_axis([0, K], name="k") 15 | C = tvm.te.compute([H, W], 16 | lambda h, w : 17 | tvm.te.sum(A[h, k] * B[k, w], axis=[k]), name="C") 18 | 19 | dC = tvm.te.placeholder([H, W], dtype=dtype, name="dC") 20 | 21 | dA = tvm.te.grad_op(A, C, dC) 22 | 23 | s = tvm.te.create_schedule(dA.op) 24 | 25 | print(tvm.lower(s, [A, B, dC, dA], simple_mode=True)) 26 | 27 | func = tvm.build(s, [A, B, dC, dA], target="llvm") 28 | 29 | A_np = 
np.random.uniform(-10, 10, [H, K]).astype("float32") 30 | B_np = np.random.uniform(-10, 10, [K, W]).astype("float32") 31 | dC_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 32 | dA_np = np.zeros([H, K]).astype("float32") 33 | 34 | ctx = tvm.device("llvm", 0) 35 | A_tvm = tvm.nd.array(A_np, ctx) 36 | B_tvm = tvm.nd.array(B_np, ctx) 37 | dC_tvm = tvm.nd.array(dC_np, ctx) 38 | dA_tvm = tvm.nd.array(dA_np, ctx) 39 | 40 | func(A_tvm, B_tvm, dC_tvm, dA_tvm) 41 | 42 | print(dA_tvm) 43 | 44 | # =======> 45 | # compare the results with numpy 46 | golden_np = np.matmul(dC_np, B_np.T) 47 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-5) 48 | print("Compare with Numpy success!") 49 | 50 | 51 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-maxpool-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | H = 4 7 | W = 4 8 | 9 | R = 2 10 | S = 2 11 | 12 | P = H // R 13 | Q = W // S 14 | 15 | dtype = "float32" 16 | 17 | 18 | def mse_loss(inputs, targets): 19 | N = inputs.shape[0] 20 | K = inputs.shape[1] 21 | n = tvm.te.reduce_axis([0, inputs.shape[0]], name="n") 22 | k = tvm.te.reduce_axis([0, inputs.shape[1]], name="k") 23 | # return tvm.te.compute([1], lambda i: tvm.te.sum((inputs[i + n, k]-targets[i + n, k])*(inputs[i + n, k]-targets[i + n, k])/(N*K), axis=[n, k]), name="mse") 24 | return tvm.te.compute([1], lambda i: tvm.te.sum(tvm.tir.power((inputs[i + n, k]-targets[i + n, k]), 2)/(N*K), axis=[n, k]), name="mse", requires_grad=True) 25 | 26 | 27 | A = tvm.te.placeholder([H, W], dtype=dtype, name="A", requires_grad=True) 28 | label = tvm.te.placeholder([P, Q], dtype=dtype, name="label", requires_grad=False) 29 | 30 | p = tvm.te.reduce_axis([0, R], "p") 31 | q = tvm.te.reduce_axis([0, S], "q") 32 | 33 | B = tvm.te.compute([P, Q], lambda a, b: tvm.te.max(A[a*R+p, b*S+q]-1e-5, axis=[p, q]), requires_grad=False, name="max_value") 34 | C = tvm.te.compute([H, W], lambda u, v: tvm.tir.if_then_else(A[u, v] > B[u//R, v//S], 1.0, 0.0), requires_grad=False, name="map") 35 | 36 | r = tvm.te.reduce_axis([0, R], "r") 37 | s = tvm.te.reduce_axis([0, S], "s") 38 | 39 | D = tvm.te.compute([P, Q], 40 | lambda i, j: tvm.te.sum(A[i*R+r, j*S+s] * C[i*R+r, j*S+s], axis=[r, s]), name="D", requires_grad=True) 41 | 42 | E = mse_loss(D, label) 43 | 44 | dA, = tvm.te.mygradient(E, [A]) 45 | 46 | s = tvm.te.create_schedule([E.op, dA.op]) 47 | 48 | print(tvm.lower(s, [A, label, E, dA], simple_mode=True)) 49 | 50 | func = tvm.build(s, [A, label, E, dA], target="llvm") 51 | 52 | A_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 53 | label_np = np.random.uniform(-10, 10, [P, Q]).astype("float32") 54 | E_np = np.zeros([1]).astype("float32") 55 | dA_np = np.zeros([H, W]).astype("float32") 56 | 57 | ctx = tvm.device("llvm", 0) 58 | A_tvm = tvm.nd.array(A_np, ctx) 59 | label_tvm = tvm.nd.array(label_np, ctx) 60 | E_tvm = tvm.nd.array(E_np, ctx) 61 | dA_tvm = tvm.nd.array(dA_np, ctx) 62 | 63 | func(A_tvm, label_tvm, E_tvm, dA_tvm) 64 | 65 | print(E_tvm) 66 | 67 | # ==> compare to pytorch 68 | 69 | A0_torch = torch.tensor(A_np, requires_grad=True) 70 | A_torch = A0_torch.unsqueeze(0).unsqueeze(1) 71 | label_torch = torch.tensor(label_np, requires_grad=False) 72 | D_torch = torch.max_pool2d(A_torch, [R, S]).squeeze() 73 | E_torch = torch.nn.functional.mse_loss(D_torch, label_torch) 74 | print(E_torch.detach().numpy()) 75 | 
E_torch.backward() 76 | 77 | tvm.testing.assert_allclose(E_tvm.asnumpy(), E_torch.detach().numpy(), atol=1e-5, rtol=1e-30) 78 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A0_torch.grad.numpy(), atol=1e-5, rtol=1e-30) -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-mse_loss-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | batch_size = 3 6 | num_classes = 5 7 | shape_size = [batch_size, num_classes] 8 | dtype = "float32" 9 | ltype = "int64" 10 | 11 | A = tvm.te.placeholder(shape_size, dtype=dtype, name="A", requires_grad=True) 12 | targets = tvm.te.placeholder(shape_size, dtype=dtype, name="targets", requires_grad=False) 13 | n = tvm.te.reduce_axis([0, A.shape[0]], name="n") 14 | k = tvm.te.reduce_axis([0, A.shape[1]], name="k") 15 | loss = tvm.te.compute([1], lambda i: tvm.te.sum( 16 | (A[i + n, k]-targets[n, k])*(A[i + n, k]-targets[n, k]), axis=[n, k]), name="mse", requires_grad=True) 17 | 18 | dloss = tvm.te.placeholder([1], dtype=dtype, name="dloss") 19 | dA, = tvm.te.mygradient(loss, [A], dloss) 20 | 21 | s = tvm.te.create_schedule([loss.op, dA.op]) 22 | 23 | print(tvm.lower(s, [A, targets, loss, dloss, dA], simple_mode=True)) 24 | 25 | func = tvm.build(s, [A, targets, loss, dloss, dA], target="llvm") 26 | 27 | A_np = np.random.uniform(-10, 10, shape_size).astype(dtype) 28 | dA_np = np.zeros([batch_size, num_classes]).astype(dtype) * 0 + 1 29 | labels_np = np.random.randint(0, num_classes, [batch_size]).astype(ltype) 30 | targets_np = np.zeros([batch_size, num_classes]).astype(dtype) 31 | for i in range(batch_size): 32 | targets_np[i][labels_np[i]] = 1.0 33 | loss_np = np.zeros([1]).astype(dtype) 34 | dloss_np = np.random.uniform(-1, 1, [1]).astype(dtype) * 0 + 1 35 | 36 | ctx = tvm.device("llvm", 0) 37 | A_tvm = tvm.nd.array(A_np, ctx) 38 | dA_tvm = tvm.nd.array(dA_np, ctx) 39 | targets_tvm = tvm.nd.array(targets_np, ctx) 40 | loss_tvm = tvm.nd.array(loss_np, ctx) 41 | dloss_tvm = tvm.nd.array(dloss_np, ctx) 42 | 43 | func(A_tvm, targets_tvm, loss_tvm, dloss_tvm, dA_tvm) 44 | 45 | print("loss_tvm", loss_tvm) 46 | print("dA_tvm", dA_tvm) 47 | 48 | # =======> 49 | # compare the results with pytorch 50 | A_torch = torch.tensor(A_np, requires_grad=True) 51 | targets_torch = torch.tensor(targets_np) 52 | loss_torch = torch.nn.functional.mse_loss(A_torch, targets_torch, reduction="sum") 53 | print("loss_pytorch", loss_torch.detach().numpy()) 54 | loss_torch.backward() 55 | print("dA_pytorch", A_torch.grad.numpy()) 56 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A_torch.grad.numpy(), rtol=1e-30, atol=1e-30) 57 | print("Compare to PyTorch success!") 58 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-padding-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | N = 2 7 | nC = 16 8 | H = 14 9 | W = 14 10 | K = 16 11 | R = 3 12 | S = 3 13 | 14 | padding = 1 15 | 16 | P = H + 2 * padding 17 | Q = W + 2 * padding 18 | 19 | dtype = "float32" 20 | 21 | A = tvm.te.placeholder([N, nC, H, W], dtype=dtype, name="A") 22 | C = tvm.te.compute([N, K, P, Q], 23 | lambda n, k, h, w : 24 | tvm.tir.if_then_else( 25 | tvm.tir.all(h >= padding, h < P-padding, w >= padding, w < Q-padding), 26 | A[n, k, h-padding, w-padding], 0.0), 27 | name="C") 28 | 29 | dC = 
tvm.te.placeholder([N, K, P, Q], dtype=dtype, name="dC") 30 | 31 | print(C.op.body[0].name) 32 | 33 | print(type(C.op.body[0].args[1])) 34 | 35 | dA = tvm.te.grad_op(A, C, dC) 36 | 37 | s = tvm.te.create_schedule(dA.op) 38 | 39 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 40 | 41 | func = tvm.build(s, [A, dC, dA], target="llvm") 42 | 43 | A_np = np.random.uniform(-10, 10, [N, nC, H, W]).astype("float32") 44 | dC_np = np.random.uniform(-10, 10, [N, K, P, Q]).astype("float32") 45 | dA_np = np.zeros([N, nC, H, W]).astype("float32") 46 | 47 | ctx = tvm.device("llvm", 0) 48 | A_tvm = tvm.nd.array(A_np, ctx) 49 | dC_tvm = tvm.nd.array(dC_np, ctx) 50 | dA_tvm = tvm.nd.array(dA_np, ctx) 51 | 52 | func(A_tvm, dC_tvm, dA_tvm) 53 | 54 | print(dA_tvm) 55 | 56 | # =======> 57 | # compare the results with numpy 58 | golden_np = dC_np[:,:, padding:P-padding, padding:Q-padding] 59 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-30) 60 | print("Compare with Numpy success!") -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-power-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | 4 | 5 | H = 4 6 | W = 2 7 | 8 | dtype = "float32" 9 | 10 | A = tvm.te.placeholder([H, W], dtype=dtype, name="A") 11 | 12 | C = tvm.te.compute([H, W], 13 | lambda h, w : 14 | tvm.tir.power(A[h, w]+1e-9, 2), name="C") 15 | 16 | dC = tvm.te.placeholder([H, W], dtype=dtype, name="dC") 17 | 18 | dA = tvm.te.grad_op(A, C, dC) 19 | 20 | # schedule C.op together with dA.op so a single function computes both 21 | 22 | s = tvm.te.create_schedule([C.op, dA.op]) 23 | 24 | print(tvm.lower(s, [A, C, dC, dA], simple_mode=True)) 25 | 26 | func = tvm.build(s, [A, C, dC, dA], target="llvm") 27 | 28 | A_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 29 | dC_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 30 | C_np = np.zeros([H, W]).astype("float32") 31 | dA_np = np.zeros([H, W]).astype("float32") 32 | 33 | ctx = tvm.device("llvm", 0) 34 | A_tvm = tvm.nd.array(A_np, ctx) 35 | C_tvm = tvm.nd.array(C_np, ctx) 36 | dC_tvm = tvm.nd.array(dC_np, ctx) 37 | dA_tvm = tvm.nd.array(dA_np, ctx) 38 | 39 | func(A_tvm, C_tvm, dC_tvm, dA_tvm) 40 | 41 | print(A_tvm) 42 | print(C_tvm) 43 | print(dC_tvm) 44 | print(dA_tvm) 45 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-repeat-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | 4 | H = 8 5 | 6 | dtype = "float32" 7 | 8 | A = tvm.te.placeholder([H, H], dtype=dtype, name="A") 9 | k = tvm.te.reduce_axis([0, H], name="k") 10 | C = tvm.te.compute([H, H], 11 | lambda h, w : 12 | tvm.te.sum(A[h, k] * A[k, w], axis=[k]), name="C") 13 | 14 | dC = tvm.te.compute([H, H], lambda h, w: 1.0, name="dC") 15 | 16 | dA = tvm.te.grad_op(A, C, dC) 17 | 18 | s = tvm.te.create_schedule(dA.op) 19 | 20 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 21 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-softmax-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | N = 2 7 | H = 14 8 | 9 | dtype = "float32" 10 | 11 | factor = 1 12 | 13 | 14 | def softmax(inputs): 15 | K = inputs.shape[-1] 16 | k = tvm.te.reduce_axis([0, K], name="k") 17 | k1 = tvm.te.reduce_axis([0, K], 
name="k1") 18 | max_val = tvm.te.compute([N, K], lambda n, h: tvm.te.max(inputs[n, k1], axis=[k1]), name="mean_val", requires_grad=True) 19 | exp_val = tvm.te.compute(inputs.shape, lambda n, h: tvm.tir.exp(inputs[n, h]-max_val[n, h]), name="Softmax_exp", requires_grad=True) 20 | sum_val = tvm.te.compute(exp_val.shape, lambda n, h: tvm.te.sum(exp_val[n, k], axis=[k]), name="Softmax_sum", requires_grad=True) 21 | final_val = tvm.te.compute(exp_val.shape, lambda n, h: exp_val[n, h]/(sum_val[n, h]), name="Softmax_div", requires_grad=True) 22 | return [exp_val, sum_val, final_val] 23 | 24 | 25 | def mse_loss(inputs, targets): 26 | N = inputs.shape[0] 27 | K = inputs.shape[1] 28 | n = tvm.te.reduce_axis([0, inputs.shape[0]], name="n") 29 | k = tvm.te.reduce_axis([0, inputs.shape[1]], name="k") 30 | # return tvm.te.compute([1], lambda i: tvm.te.sum((inputs[i + n, k]-targets[i + n, k])*(inputs[i + n, k]-targets[i + n, k])/(N*K), axis=[n, k]), name="mse") 31 | return tvm.te.compute([1], lambda i: tvm.te.sum(tvm.tir.power((inputs[i + n, k]-targets[i + n, k]), 2)/(N*K), axis=[n, k]), name="mse", requires_grad=True) 32 | 33 | 34 | A = tvm.te.placeholder([N, H], dtype=dtype, name="A", requires_grad=True) 35 | label = tvm.te.placeholder([N, H], dtype=dtype, name="label", requires_grad=False) 36 | B, C, D = softmax(A) 37 | E = mse_loss(D, label) 38 | print(E.requires_grad) 39 | 40 | dD = tvm.te.placeholder([N, H], dtype=dtype, name="dD") 41 | dE = tvm.te.placeholder([1], dtype=dtype, name="dE") 42 | 43 | dA, = tvm.te.mygradient(E, [A]) 44 | 45 | s = tvm.te.create_schedule([E.op, dA.op]) 46 | 47 | print(tvm.lower(s, [A, label, D, E, dA], simple_mode=True)) 48 | 49 | func = tvm.build(s, [A, label, D, E, dA], target="llvm") 50 | 51 | A_np = np.random.uniform(-100, 100, [N, H]).astype("float32") 52 | label_np = np.random.uniform(-1, 1, [N, H]).astype("float32") 53 | D_np = np.zeros([N, H]).astype("float32") 54 | E_np = np.zeros([1]).astype("float32") 55 | 56 | dA_np = np.zeros([N, H]).astype("float32") 57 | 58 | ctx = tvm.device("llvm", 0) 59 | A_tvm = tvm.nd.array(A_np, ctx) 60 | label_tvm = tvm.nd.array(label_np, ctx) 61 | D_tvm = tvm.nd.array(D_np, ctx) 62 | dA_tvm = tvm.nd.array(dA_np, ctx) 63 | E_tvm = tvm.nd.array(E_np, ctx) 64 | 65 | func(A_tvm, label_tvm, D_tvm, E_tvm, dA_tvm) 66 | 67 | # print("TVM result:\n", D_tvm) 68 | print("TVM gradient:\n", dA_tvm) 69 | 70 | # =======> 71 | # compare the results with pytorch 72 | A_torch = torch.tensor(A_np, requires_grad=True) 73 | label_torch = torch.tensor(label_np, requires_grad=False) 74 | E_torch = torch.tensor(E_np) 75 | 76 | D_torch = torch.nn.functional.softmax(A_torch, dim=1) 77 | E_torch = torch.nn.functional.mse_loss(D_torch, label_torch) 78 | # print("Pytorch result:\n", D_torch.detach().numpy()) 79 | E_torch.backward() 80 | print("Pytorch gradient:\n", A_torch.grad.numpy()) 81 | tvm.testing.assert_allclose(D_tvm.asnumpy(), D_torch.detach().numpy(), atol=1e-6*factor, rtol=1e-5) 82 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A_torch.grad.numpy(), atol=1e-6*factor, rtol=1e-5) 83 | print("Compare with PyTorch success!") 84 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-sub-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | 4 | H = 8 5 | W = 9 6 | 7 | dtype = "float32" 8 | 9 | A = tvm.te.placeholder([H, W], dtype=dtype, name="A") 10 | 11 | C = tvm.te.compute([H, W], 12 | lambda h, w : 13 | A[h, w] * 4 - A[h, w] 
* A[h, w], name="C") 14 | 15 | dC = tvm.te.compute([H, W], lambda h, w: 1.0, name="dC") 16 | 17 | dA = tvm.te.grad_op(A, C, dC) 18 | 19 | s = tvm.te.create_schedule(dA.op) 20 | 21 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 22 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-sub-case2.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | 4 | H = 8 5 | W = 9 6 | 7 | dtype = "float32" 8 | 9 | A = tvm.te.placeholder([H], dtype=dtype, name="A") 10 | 11 | C = tvm.te.compute([H, W], 12 | lambda h, w : 13 | A[h] * 4 - A[h] * A[h], name="C") 14 | 15 | dC = tvm.te.compute([H, W], lambda h, w: 1.0, name="dC") 16 | 17 | dA = tvm.te.grad_op(A, C, dC) 18 | 19 | s = tvm.te.create_schedule(dA.op) 20 | 21 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 22 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/test_report.md: -------------------------------------------------------------------------------- 1 | ## Tested Ops 2 | 3 | | op name | case No. | grad to | configs | gradient | build | correctness | 4 | | --- | --- | --- | --- | --- | --- | --- | 5 | | GEMM | 1 | A | | yes | yes | rtol=1e-5 | 6 | | Conv2d | 1 | A | st=1, pad=0, group=1, dilation=1 | yes | yes | rtol=1e-3 | 7 | | Conv2d in topi | 1 | A | st=1, pad=0, group=1, dilation=1 | yes | yes | rtol=1e-5, atol=1e-4 | 8 | | Conv2d | 2 | A | st=2, pad=0, group=1, dilation=1 | yes | yes | rtol=1e-3 | 9 | | Conv2d | 3 | A | st=2, pad=0, group=2, dilation=1 | yes | yes | rtol=1e-3 | 10 | | Flatten | 1 | A | | yes | yes | rtol<1e-30 | 11 | | Downcast | 1 | A | | yes | yes | rtol<1e-30 | 12 | | Broadcast | 1 | A | | yes | yes | rtol=1e-6 | 13 | | Padding | 1 | A | | yes | yes | rtol<1e-30 | 14 | | AvgPool | 1 | A | | yes | yes | rtol<1e-30 | 15 | | Softmax | 1 | A | | yes | yes | atol=1e-6, rtol=1e-5 | 16 | | Maxpool | 1 | A | | yes | yes | atol=1e-5, rtol=1e-30 | 17 | | Tanh | 1 | A | | yes | yes | atol=1e-6, rtol=1e-7 | 18 | | ReLU | 1 | A | | yes | yes | atol<1e-30, rtol<1e-30 | 19 | | Mse_loss | 1 | A | | yes | yes | rtol<1e-30, atol<1e-30 | 20 | | Cross_entropy | 1 | A | | yes | yes | rtol=1e-30, atol=1e-9 | 21 | | Concatenate | 1 | A,B | | yes | yes | rtol<1e-30, atol<1e-30 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/tir-relu-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | dim1 = 8 7 | dim2 = 4 8 | shape_size = [dim1, dim2] 9 | dtype = "float32" 10 | 11 | A = tvm.te.placeholder(shape_size, dtype=dtype, name="A", requires_grad=True) 12 | zeros = tvm.tir.expr.const(0, dtype) 13 | func = lambda *args: tvm.tir.if_then_else(A[args] > zeros, A[args], zeros) 14 | C = tvm.te.compute(A.shape, func, "ReLU", requires_grad=True) 15 | 16 | dC = tvm.te.placeholder(A.shape, dtype=dtype, name="dC") 17 | dA, = tvm.te.mygradient(C, [A], dC) 18 | #dA = tvm.te.grad_op(A, C, dC) 19 | 20 | s = tvm.te.create_schedule(dA.op) 21 | 22 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 23 | 24 | func = tvm.build(s, [A, dC, dA], target="llvm") 25 | 26 | A_np = np.random.uniform(-10, 10, shape_size).astype("float32") 27 | #elements are all 1 28 | dC_np = np.ones(shape_size).astype("float32") 29 | dA_np = np.zeros(shape_size).astype("float32") 30 | 31 | ctx = tvm.device("llvm", 0) 32 | A_tvm 
= tvm.nd.array(A_np, ctx) 33 | dC_tvm = tvm.nd.array(dC_np, ctx) 34 | dA_tvm = tvm.nd.array(dA_np, ctx) 35 | 36 | func(A_tvm, dC_tvm, dA_tvm) 37 | 38 | print("dA_tvm", dA_tvm) 39 | 40 | # =======> 41 | # compare the results with pytorch 42 | A_torch = torch.tensor(A_np, requires_grad=True) 43 | C_torch = torch.nn.ReLU()(A_torch) 44 | loss = C_torch.sum() 45 | loss.backward() 46 | print("Pytorch gradient:\n", A_torch.grad.numpy()) 47 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A_torch.grad.numpy(), atol=1e-30, rtol=1e-30) 48 | print("Compare with PyTorch success!") 49 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/tir-tanh-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | dim1 = 8 7 | dim2 = 4 8 | shape_size = [dim1, dim2] 9 | dtype = "float32" 10 | 11 | A = tvm.te.placeholder(shape_size, dtype=dtype, name="A", requires_grad=True) 12 | C = tvm.te.compute(A.shape, lambda *args: tvm.tir.tanh(A[args]), "tanh", requires_grad=True) 13 | 14 | dC = tvm.te.placeholder(A.shape, dtype=dtype, name="dC") 15 | dA, = tvm.te.mygradient(C, [A], dC) 16 | #dA = tvm.te.grad_op(A, C, dC) 17 | 18 | s = tvm.te.create_schedule(dA.op) 19 | 20 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 21 | 22 | func = tvm.build(s, [A, dC, dA], target="llvm") 23 | 24 | A_np = np.random.uniform(-10, 10, shape_size).astype("float32") 25 | # elements are all 1 26 | dC_np = np.ones(shape_size).astype("float32") 27 | dA_np = np.zeros(shape_size).astype("float32") 28 | 29 | ctx = tvm.device("llvm", 0) 30 | A_tvm = tvm.nd.array(A_np, ctx) 31 | dC_tvm = tvm.nd.array(dC_np, ctx) 32 | dA_tvm = tvm.nd.array(dA_np, ctx) 33 | 34 | func(A_tvm, dC_tvm, dA_tvm) 35 | 36 | print("dA_tvm", dA_tvm) 37 | 38 | # =======> 39 | # compare the results with pytorch 40 | A_torch = torch.tensor(A_np, requires_grad=True) 41 | C_torch = torch.tanh(A_torch) 42 | loss = C_torch.sum() 43 | loss.backward() 44 | print("Pytorch gradient:\n", A_torch.grad.numpy()) 45 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A_torch.grad.numpy(), atol=1e-6, rtol=1e-7) 46 | print("Compare with PyTorch success!") 47 | 48 | 49 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/train/get_lm_data.sh: -------------------------------------------------------------------------------- 1 | echo "=== Acquiring datasets ===" 2 | echo "---" 3 | mkdir -p save 4 | 5 | mkdir -p data 6 | cd data 7 | 8 | echo "- Downloading WikiText-2 (WT2)" 9 | wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 10 | unzip -q wikitext-2-v1.zip 11 | cd wikitext-2 12 | mv wiki.train.tokens train.txt 13 | mv wiki.valid.tokens valid.txt 14 | mv wiki.test.tokens test.txt 15 | cd .. 16 | 17 | echo "- Downloading WikiText-103 (WT103)" 18 | wget --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip 19 | unzip -q wikitext-103-v1.zip 20 | cd wikitext-103 21 | mv wiki.train.tokens train.txt 22 | mv wiki.valid.tokens valid.txt 23 | mv wiki.test.tokens test.txt 24 | cd .. 25 | 26 | echo "- Downloading enwik8 (Character)" 27 | mkdir -p enwik8 28 | cd enwik8 29 | wget --continue http://mattmahoney.net/dc/enwik8.zip 30 | wget https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py 31 | python prep_enwik8.py 32 | cd ..
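# Note: both PTB variants below (word-level "penn" and character-level
# "pennchar") are extracted from the same simple-examples.tgz archive,
# which is removed at the end of this script.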
33 | 34 | echo "- Downloading Penn Treebank (PTB)" 35 | wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 36 | tar -xzf simple-examples.tgz 37 | 38 | mkdir -p penn 39 | cd penn 40 | mv ../simple-examples/data/ptb.train.txt train.txt 41 | mv ../simple-examples/data/ptb.test.txt test.txt 42 | mv ../simple-examples/data/ptb.valid.txt valid.txt 43 | cd .. 44 | 45 | echo "- Downloading Penn Treebank (Character)" 46 | mkdir -p pennchar 47 | cd pennchar 48 | mv ../simple-examples/data/ptb.char.train.txt train.txt 49 | mv ../simple-examples/data/ptb.char.test.txt test.txt 50 | mv ../simple-examples/data/ptb.char.valid.txt valid.txt 51 | cd .. 52 | 53 | rm -rf simple-examples/ 54 | 55 | echo "---" 56 | echo "Happy language modeling :)" -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/train/mi_lstm_pytorch.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class MultiplicativeIntegration(nn.Module): 8 | def __init__(self, 9 | inputs_sizes: List[int], 10 | output_sizes: List[int], 11 | bias: bool, 12 | bias_start: float = 0.0, 13 | alpha_start: float = 1.0, 14 | beta_start: float = 1.0): 15 | super().__init__() 16 | self.inputs_sizes = inputs_sizes 17 | self.output_sizes = output_sizes 18 | total_output_size = sum(output_sizes) 19 | total_input_size = sum(inputs_sizes) 20 | self.bias_start = bias_start 21 | self.alpha_start = alpha_start 22 | self.beta_start = beta_start 23 | self.weights = nn.Parameter(torch.empty(total_input_size, total_output_size)) 24 | self.alphas = nn.Parameter(torch.empty([total_output_size])) 25 | self.betas = nn.Parameter(torch.empty([2*total_output_size])) 26 | self.biases = nn.Parameter(torch.empty([total_output_size])) if bias else None 27 | self.reset_parameters() 28 | 29 | def forward(self, input0, input1): 30 | # input0.shape = (seq_len x batch_size x input_size), input1.shape = (seq_len x batch_size x num_units) 31 | # w1.shape = (input_size x 4 * num_units), w2.shape = (num_units x 4 * num_units) 32 | w1, w2 = torch.split(self.weights, self.inputs_sizes, dim=0) 33 | # b1.shape, b2.shape = (4 * num_units) 34 | b1, b2 = torch.split(self.betas, sum(self.output_sizes), dim=0) 35 | # wx1.shape = (seq_len x batch_size x 4 * num_units), wx2.shape = (seq_len x batch_size x 4 * num_units) 36 | wx1, wx2 = input0 @ w1, input1 @ w2 37 | # res.shape = (seq_len x batch_size x 4 * num_units) 38 | res = self.alphas * wx1 * wx2 + b1 * wx1 + b2 * wx2 39 | if self.biases is not None: res += self.biases 40 | return res 41 | 42 | def reset_parameters(self): 43 | nn.init.xavier_uniform_(self.weights, gain=1.0) 44 | nn.init.constant_(self.alphas, self.alpha_start) 45 | nn.init.constant_(self.betas, self.beta_start) 46 | if self.biases is not None: 47 | nn.init.constant_(self.biases, self.bias_start) 48 | 49 | 50 | class MILSTMCell(nn.Module): 51 | def __init__(self, input_size, num_units, forget_bias=0.0, 52 | bias_start=0.0, alpha_start=1.0, 53 | beta_start=1.0, activation=torch.tanh): 54 | super().__init__() 55 | self._input_size = input_size 56 | self._num_units = num_units 57 | self._forget_bias = forget_bias 58 | self._bias_start = bias_start 59 | self._alpha_start = alpha_start 60 | self._beta_start = beta_start 61 | self._activation = activation 62 | self.mi_module = MultiplicativeIntegration( 63 | inputs_sizes=[input_size, num_units], 64 | output_sizes=[num_units, 
num_units, num_units, num_units], 65 | bias=True, 66 | bias_start=bias_start, 67 | alpha_start=alpha_start, 68 | beta_start=beta_start, 69 | ) 70 | 71 | def forward(self, inputs, state): 72 | # c/h.shape = (seq_len x batch_size x num_units) 73 | c, h = state 74 | # i = input_gate, j = new_input, f = forget_gate, o = output_gate 75 | concat = self.mi_module(inputs, h) 76 | # i/j/f/o.shape = (seq_len x batch_size x num_units) 77 | i, j, f, o = torch.split(concat, self._num_units, dim=2) 78 | # new_c.shape = (seq_len x batch_size x num_units); standard LSTM cell update: f gates the old cell, i gates the candidate 79 | new_c = c * torch.sigmoid(f + self._forget_bias) + torch.sigmoid(i) * self._activation(j) 80 | # new_h.shape = (seq_len x batch_size x num_units) 81 | new_h = self._activation(new_c) * torch.sigmoid(o) 82 | new_state = new_c, new_h 83 | return new_h, new_state 84 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/train/scrnn_pytorch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class SCRNCell(nn.Module): 6 | def __init__(self, input_size, num_units, context_units, alpha): 7 | super().__init__() 8 | self._input_size = input_size 9 | self._num_units = num_units 10 | self._context_units = context_units 11 | self._alpha = alpha 12 | self.B = nn.Parameter(torch.empty(input_size, context_units)) 13 | self.V = nn.Parameter(torch.empty(context_units, num_units)) 14 | self.U = nn.Parameter(torch.empty(num_units, num_units)) 15 | self.fc = nn.Linear(context_units + input_size + num_units, num_units, bias=False) 16 | self.reset_parameters() # weight initialization: glorot uniform 17 | 18 | # NOTE: rnn_cell_impl._linear: https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py#L127 19 | def forward(self, inputs, state): 20 | # state_h.shape = (seq_len x batch_size x num_units), state_c.shape = (seq_len x batch_size x context_units) 21 | state_h, state_c = state.split([self._num_units, self._context_units], dim=2) 22 | # context_state.shape = (seq_len x batch_size x context_units) 23 | context_state = (1 - self._alpha) * (inputs @ self.B) + self._alpha * state_c 24 | # hidden_state.shape = (seq_len x batch_size x num_units) 25 | state_h = state_h.expand(inputs.shape[0], -1, -1) 26 | hidden_state = torch.sigmoid(self.fc(torch.cat([context_state, inputs, state_h], dim=2))) 27 | # output.shape = (seq_len x batch_size x num_units) 28 | output = hidden_state @ self.U + context_state @ self.V 29 | # new_state.shape = (seq_len x batch_size x (num_units+context_units)) 30 | new_state = torch.cat([hidden_state, context_state], dim=2) 31 | return output, new_state 32 | 33 | def reset_parameters(self): 34 | for weight in self.parameters(): 35 | nn.init.xavier_uniform_(weight, gain=1.0) 36 | -------------------------------------------------------------------------------- /flextensor/testing/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /flextensor/testing/array_mul.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | from flextensor.utils import to_tuple 4 | 5 | 6 | def array_mul(N): 7 | A = tvm.te.placeholder((N,), dtype="float32") 8 | B = tvm.te.placeholder((N,), dtype="float32") 9 | C = tvm.te.compute((N,), lambda i: A[i] * B[i]) 10 | return [C.op], [A, B, C] 11 | 
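# test_array_mul below builds the elementwise kernel for every N in
# [1, extent], times each with time_evaluator (or a host-side loop), and
# reports each cost normalized to the N=1 case, i.e. how runtime scales
# with the array extent.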
12 | 13 | 14 | def test_array_mul(extent=1024, target="llvm", dev_id=0, number=10, verbose=False): 15 | time_cost_lst = [] 16 | for N in range(1, extent+1): 17 | ctx = tvm.device(target, dev_id) 18 | ary_ops, ary_bufs = array_mul(N) 19 | ary_inputs = [tvm.nd.array(np.random.uniform(size=to_tuple(buf.shape)).astype(buf.dtype), ctx) for buf in ary_bufs[:-1]] 20 | ary_inputs += [tvm.nd.array(np.zeros(shape=to_tuple(buf.shape), dtype=buf.dtype), ctx) for buf in ary_bufs[-1:]] 21 | 22 | s = tvm.te.create_schedule(ary_ops) 23 | func = tvm.build(s, ary_bufs, target) 24 | evaluator = func.time_evaluator(func.entry_name, ctx, number=number) 25 | 26 | cost = evaluator(*ary_inputs).mean * 1e3 27 | # print("N=", N, "cost=", "%f(ms)"%cost, "(target=%s, dev_id=%d, number=%d)"%(target, dev_id, number)) 28 | time_cost_lst.append(cost) 29 | 30 | res_lst = [x / time_cost_lst[0] for x in time_cost_lst] 31 | print("array_mul |(target=%s, dev_id=%d, number=%d)"%(target, dev_id, number)) 32 | if verbose: 33 | for i, res in enumerate(res_lst): 34 | print("time_cost: ext=%d / ext=1 = %f"%(i + 1, res)) 35 | else: 36 | print("time_cost: ext=%d / ext=1 = %f"%(extent, res_lst[-1])) 37 | 38 | 39 | if __name__ == "__main__": 40 | test_array_mul(extent=1024, number=1000, verbose=True) -------------------------------------------------------------------------------- /flextensor/testing/net/pytorch-overfeat.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import time 4 | 5 | 6 | class ConvBlock(nn.Module): 7 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False): 8 | super(ConvBlock, self).__init__() 9 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, 10 | padding=padding, dilation=dilation, groups=groups, bias=bias) 11 | 12 | def forward(self, inputs): 13 | ret = self.conv(inputs) 14 | return ret 15 | 16 | 17 | class Flatten(nn.Module): 18 | def __init__(self): 19 | super(Flatten, self).__init__() 20 | 21 | def forward(self, inputs): 22 | return torch.flatten(inputs) 23 | 24 | 25 | class OverFeat(nn.Module): 26 | def __init__(self, image_channel=3, num_classes=1470): 27 | super(OverFeat, self).__init__() 28 | self.net = nn.Sequential( 29 | ConvBlock(image_channel, 96, 11, 4, 5), 30 | nn.MaxPool2d(2, 2), 31 | ConvBlock(96, 256, 5, 1, 2), 32 | nn.MaxPool2d(2, 2), 33 | ConvBlock(256, 512, 3, 1, 1), 34 | ConvBlock(512, 1024, 3, 1, 1), 35 | ConvBlock(1024, 1024, 3, 1, 1), 36 | nn.MaxPool2d(2, 2), 37 | Flatten(), 38 | nn.Linear(1024 * 6 * 6, 3072), 39 | nn.Linear(3072, 4096), 40 | nn.Linear(4096, num_classes) 41 | ) 42 | 43 | def forward(self, inputs): 44 | return self.net(inputs) 45 | 46 | 47 | if __name__ == "__main__": 48 | net = OverFeat(3, 1000) 49 | net.cuda("cuda:0") 50 | batch_size = 1 51 | inputs = torch.randn([batch_size, 3, 192, 192]).cuda("cuda:0") 52 | output = net(inputs) 53 | 54 | torch.cuda.synchronize() 55 | beg = time.time() 56 | device_time = 0.0 57 | for i in range(50): 58 | start = torch.cuda.Event(enable_timing=True) 59 | finish = torch.cuda.Event(enable_timing=True) 60 | start.record() 61 | net(inputs) 62 | finish.record() 63 | torch.cuda.synchronize() 64 | device_time += start.elapsed_time(finish) 65 | end = time.time() 66 | print("Host time pass {}ms".format((end - beg) * 1e3 / 50)) 67 | print("Device time pass {}ms".format(device_time / 50)) -------------------------------------------------------------------------------- 
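Note on the timing loop used by both network benchmarks in this directory (the OverFeat script above and the YOLO script below): host time from time.time() includes Python launch overhead, while torch.cuda.Event records timestamps on the CUDA stream itself, so the two numbers can differ noticeably. A minimal, self-contained sketch of the same pattern — the names time_cuda, net, and inputs are illustrative, not code from this repository:

import torch

def time_cuda(net, inputs, iters=50):
    torch.cuda.synchronize()          # drain pending GPU work before timing
    total_ms = 0.0
    for _ in range(iters):
        start = torch.cuda.Event(enable_timing=True)
        finish = torch.cuda.Event(enable_timing=True)
        start.record()                # timestamp enqueued on the current stream
        net(inputs)
        finish.record()
        torch.cuda.synchronize()      # elapsed_time is only valid after sync
        total_ms += start.elapsed_time(finish)  # milliseconds
    return total_ms / iters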
/flextensor/testing/net/pytorch-yolo-v1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import time 4 | 5 | 6 | class ConvBlock(nn.Module): 7 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False): 8 | super(ConvBlock, self).__init__() 9 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, 10 | padding=padding, dilation=dilation, groups=groups, bias=bias) 11 | self.act = nn.ReLU() 12 | 13 | def forward(self, inputs): 14 | ret = self.conv(inputs) 15 | ret = self.act(ret) 16 | return ret 17 | 18 | 19 | class Flatten(nn.Module): 20 | def __init__(self): 21 | super(Flatten, self).__init__() 22 | 23 | def forward(self, inputs): 24 | return torch.flatten(inputs) 25 | 26 | 27 | class YOLO(nn.Module): 28 | def __init__(self, image_channel=3, num_classes=1470): 29 | super(YOLO, self).__init__() 30 | self.net = nn.Sequential( 31 | ConvBlock(image_channel, 64, 7, 2, 3), 32 | nn.MaxPool2d(2, 2), 33 | ConvBlock(64, 192, 3, 1, 1), 34 | nn.MaxPool2d(2, 2), 35 | ConvBlock(192, 128, 1, 1, 0), 36 | ConvBlock(128, 256, 3, 1, 1), 37 | ConvBlock(256, 256, 1, 1, 0), 38 | ConvBlock(256, 512, 3, 1, 1), 39 | nn.MaxPool2d(2, 2), 40 | ConvBlock(512, 256, 1, 1, 0), 41 | ConvBlock(256, 512, 3, 1, 1), 42 | ConvBlock(512, 256, 1, 1, 0), 43 | ConvBlock(256, 512, 3, 1, 1), 44 | ConvBlock(512, 256, 1, 1, 0), 45 | ConvBlock(256, 512, 3, 1, 1), 46 | ConvBlock(512, 256, 1, 1, 0), 47 | ConvBlock(256, 512, 3, 1, 1), 48 | ConvBlock(512, 512, 1, 1, 0), 49 | ConvBlock(512, 1024, 3, 1, 1), 50 | nn.MaxPool2d(2, 2), 51 | ConvBlock(1024, 512, 1, 1, 0), 52 | ConvBlock(512, 1024, 3, 1, 1), 53 | ConvBlock(1024, 512, 1, 1, 0), 54 | ConvBlock(512, 1024, 3, 1, 1), 55 | ConvBlock(1024, 1024, 3, 1, 1), 56 | ConvBlock(1024, 1024, 3, 2, 1), 57 | ConvBlock(1024, 1024, 3, 1, 1), 58 | ConvBlock(1024, 1024, 3, 1, 1), 59 | Flatten(), 60 | nn.Linear(1024 * 7 * 7, 4096), 61 | nn.ReLU(), 62 | nn.Linear(4096, num_classes) 63 | ) 64 | 65 | def forward(self, inputs): 66 | return self.net(inputs) 67 | 68 | 69 | if __name__ == "__main__": 70 | net = YOLO(3, 1470) 71 | net.cuda("cuda:0") 72 | batch_size = 1 73 | inputs = torch.randn([batch_size, 3, 448, 448]).cuda("cuda:0") 74 | output = net(inputs) 75 | 76 | torch.cuda.synchronize() 77 | beg = time.time() 78 | device_time = 0.0 79 | for i in range(50): 80 | start = torch.cuda.Event(enable_timing=True) 81 | finish = torch.cuda.Event(enable_timing=True) 82 | start.record() 83 | net(inputs) 84 | finish.record() 85 | torch.cuda.synchronize() 86 | device_time += start.elapsed_time(finish) 87 | end = time.time() 88 | print("Host time pass {}ms".format((end - beg) * 1e3 / 50)) 89 | print("Device time pass {}ms".format(device_time / 50)) -------------------------------------------------------------------------------- /flextensor/testing/others/assemble.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This is used to assemble results from experiments 3 | ''' 4 | import os 5 | 6 | 7 | if __name__ == "__main__": 8 | dir_names = os.listdir(".") 9 | dir_names = list(filter(lambda x: os.path.isdir(x), dir_names)) 10 | print(dir_names) 11 | for name in dir_names: 12 | print(name.split("conv")) 13 | write_lines = [] 14 | dir_names = sorted(dir_names, key=lambda x: int(x.split("conv")[1])) 15 | for dir_name in dir_names: 16 | dir_path = os.path.join(".", dir_name) 17 | file_names = os.listdir(dir_path) 18 | 
for file_name in file_names: 19 | if "config" in file_name: 20 | file_path = os.path.join(dir_path, file_name) 21 | with open(file_path, "r") as fin: 22 | lines = fin.readlines() 23 | if lines: 24 | line = lines[-1] 25 | write_lines.append(line) 26 | with open("configs.txt", "w") as fout: 27 | for line in write_lines: 28 | fout.write(line) -------------------------------------------------------------------------------- /flextensor/testing/others/compare_conv_cpu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import tvm 3 | import torch 4 | import numpy as np 5 | from flextensor.configs.conv2d_config import all_conv_shapes 6 | from flextensor.utils import to_tuple 7 | from flextensor.nn import conv2d_nchw 8 | 9 | 10 | def evaluate(s, bufs, target, dev_id, number=10): 11 | ctx = tvm.device(target, dev_id) 12 | tvm_arys = [] 13 | for arg in bufs: 14 | shape = to_tuple(arg.shape) 15 | tmp = np.random.uniform(-10, 10, size=shape).astype(arg.dtype) 16 | tmp = tvm.nd.array(tmp, ctx) 17 | tvm_arys.append(tmp) 18 | func, evaluator = None, None 19 | try: 20 | func = tvm.build(s, bufs, target) 21 | # evaluator = func.time_evaluator(func.entry_name, ctx, number=number) 22 | # time_cost = evaluator(*tvm_arys).mean * 1e3 23 | beg = time.time() 24 | for i in range(number): 25 | func(*tvm_arys) 26 | end = time.time() 27 | time_cost = (end - beg) * 1e3 / number 28 | return time_cost 29 | except Exception as e: 30 | print(e) 31 | return float("inf") 32 | 33 | 34 | def pytorch_conv(batch, channel, out_channel, height, width, k_h, k_w, stride, pad, target, number=10): 35 | A = torch.rand((batch, channel, height, width), dtype=torch.float32) 36 | W = torch.rand((out_channel, channel, k_h, k_w), dtype=torch.float32) 37 | if target == "cuda": 38 | A = A.cuda() 39 | W = W.cuda() 40 | # warm-up 41 | Out = torch.nn.functional.conv2d(A, W, stride=stride, padding=pad) 42 | beg = time.time() 43 | for i in range(number): 44 | Out = torch.nn.functional.conv2d(A, W, stride=stride, padding=pad) 45 | end = time.time() 46 | return (end - beg) * 1e3 / number 47 | 48 | 49 | def tvm_conv(batch, channel, out_channel, height, width, k_h, k_w, stride, pad, target, devid=0, number=10): 50 | A = tvm.te.placeholder((batch, channel, height, width), dtype="float32") 51 | W = tvm.te.placeholder((out_channel, channel, k_h, k_w), dtype="float32") 52 | Output = conv2d_nchw(A, W, stride=stride, padding=pad) 53 | s = tvm.te.create_schedule(Output.op) 54 | bufs = [A, W, Output] 55 | return evaluate(s, bufs, target, devid, number) 56 | 57 | 58 | def compare(write_file): 59 | for config in all_conv_shapes: 60 | batch, channel, h, w, out_channel, _, k_h, k_w, _, stride, pad, _, _ = config 61 | torch_time = pytorch_conv(batch, channel, out_channel, h, w, k_h, k_w, stride, pad, "llvm", 10) 62 | tvm_time = tvm_conv(batch, channel, out_channel, h, w, k_h, k_w, stride, pad, "llvm", 1, 10) 63 | shape = (batch, channel, h, w, out_channel, k_h, k_w, stride, pad) 64 | print(shape, ": torch:", torch_time, " tvm", tvm_time, file=write_file, flush=True) 65 | 66 | 67 | if __name__ == "__main__": 68 | with open("cmp_conv_cpu.txt", "w") as f: 69 | compare(f) -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/complex-gemm.cl: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | extern "C" void default_function_kernel0( float* C, float* A1, float* A, float* B1, float* B) { 
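// Blocked GEMM: C[1024x512] = A[1024x256] * B[256x512]. Rows of C are tiled
// as i_inner_outer (32) x i_inner_inner (32), columns as j_outer (32) x
// j_inner (16), and the reduction as k_outer (32) x k_inner (8). A1 (32x8)
// and B1 (8x16) are staging buffers refilled from A and B once per k_outer
// step before the multiply-accumulate loop nest runs on them.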
6 | #pragma HLS INTERFACE m_axi port=C offset=slave bundle=gmem 7 | #pragma HLS INTERFACE s_axilite port=C bundle=control 8 | #pragma HLS INTERFACE m_axi port=A1 offset=slave bundle=gmem 9 | #pragma HLS INTERFACE s_axilite port=A1 bundle=control 10 | #pragma HLS INTERFACE m_axi port=A offset=slave bundle=gmem 11 | #pragma HLS INTERFACE s_axilite port=A bundle=control 12 | #pragma HLS INTERFACE m_axi port=B1 offset=slave bundle=gmem 13 | #pragma HLS INTERFACE s_axilite port=B1 bundle=control 14 | #pragma HLS INTERFACE m_axi port=B offset=slave bundle=gmem 15 | #pragma HLS INTERFACE s_axilite port=B bundle=control 16 | #pragma HLS INTERFACE s_axilite port=return bundle=control 17 | 18 | for (int i_inner_outer = 0; i_inner_outer < 32; ++i_inner_outer) { 19 | for (int j_outer = 0; j_outer < 32; ++j_outer) { 20 | for (int i_inner_inner_init = 0; i_inner_inner_init < 32; ++i_inner_inner_init) { 21 | for (int j_inner_init = 0; j_inner_init < 16; ++j_inner_init) { 22 | C[((((i_inner_outer * 16384) + (i_inner_inner_init * 512)) + (j_outer * 16)) + j_inner_init)] = 0.000000e+00f; 23 | } 24 | } 25 | for (int k_outer = 0; k_outer < 32; ++k_outer) { 26 | for (int i = 0; i < 32; ++i) { 27 | for (int j = 0; j < 8; ++j) { 28 | A1[((i * 8) + j)] = A[((((i_inner_outer * 8192) + (i * 256)) + (k_outer * 8)) + j)]; 29 | } 30 | } 31 | for (int i1 = 0; i1 < 8; ++i1) { 32 | for (int j1 = 0; j1 < 16; ++j1) { 33 | B1[((i1 * 16) + j1)] = B[((((k_outer * 4096) + (i1 * 512)) + (j_outer * 16)) + j1)]; 34 | } 35 | } 36 | for (int i_inner_inner = 0; i_inner_inner < 32; ++i_inner_inner) { 37 | for (int k_inner = 0; k_inner < 8; ++k_inner) { 38 | for (int j_inner = 0; j_inner < 16; ++j_inner) { 39 | C[((((i_inner_outer * 16384) + (i_inner_inner * 512)) + (j_outer * 16)) + j_inner)] = (C[((((i_inner_outer * 16384) + (i_inner_inner * 512)) + (j_outer * 16)) + j_inner)] + (A1[((i_inner_inner * 8) + k_inner)] * B1[((k_inner * 16) + j_inner)])); 40 | } 41 | } 42 | } 43 | } 44 | } 45 | } 46 | } -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/conv_example.cl: -------------------------------------------------------------------------------- 1 | #define USER_DEFINED_COMPUTE_SIZE_COMPUTE xxx 2 | #define USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER xxx 3 | #define USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER1 xxx 4 | 5 | __kernel void default_function_kernel0(__global float* restrict compute, __global float* restrict placeholder, __global float* restrict placeholder1) { 6 | // declare use local memory 7 | local float local_compute[USER_DEFINED_COMPUTE_SIZE_COMPUTE]; 8 | local float local_placeholder[USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER]; 9 | local float local_placeholder1[USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER1]; 10 | 11 | // read data to local memory 12 | event_t evt[3]; 13 | evt[0] = async_work_group_copy(local_compute, compute, USER_DEFINED_COMPUTE_SIZE_COMPUTE, 0); 14 | evt[1] = async_work_group_copy(local_placeholder, placeholder, USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER, 0); 15 | evt[2] = async_work_group_copy(local_placeholder1, placeholder1, USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER1, 0); 16 | // barrier 17 | wait_group_events(2, evt); 18 | 19 | // compute pipeline 20 | for (int b_inner = 0; b_inner < 8; ++b_inner) { 21 | for (int c = 0; c < 1024; ++c) { 22 | for (int h = 0; h < 7; ++h) { 23 | for (int w = 0; w < 7; ++w) { 24 | local_compute[((((b_inner * 50176) + (c * 49)) + (h * 7)) + w)] = 0.000000e+00f; 25 | for (int rc = 0; rc < 1024; ++rc) { 26 | for (int rw = 0; rw < 3; 
++rw) { 27 | // FPGA pipeline 28 | __attribute_((xcl_pipeline_loop)){ 29 | for (int rh = 0; rh < 3; ++rh) { 30 | local_compute[((((b_inner * 50176) + (c * 49)) + (h * 7)) + w)] = (local_compute[((((b_inner * 50176) + (c * 49)) + (h * 7)) + w)] + ((float)(((1 <= ((h * 2) + rh)) && (1 <= ((w * 2) + rw))) ? local_placeholder[(((((((b_inner * 200704) + (rc * 196)) + (h * 28)) + (rh * 14)) + (w * 2)) + rw) - 15)] : 0.000000e+00f) * local_placeholder1[((((c * 9216) + (rc * 9)) + (rh * 3)) + rw)])); 31 | } 32 | } 33 | } 34 | } 35 | } 36 | } 37 | } 38 | } 39 | 40 | 41 | // write data back to global memory 42 | evt[0] = async_work_group_copy(compute, local_compute, USER_DEFINED_COMPUTE_SIZE_COMPUTE, 0); 43 | evt[1] = async_work_group_copy(placeholder, local_placeholder, USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER, 0); 44 | evt[2] = async_work_group_copy(placeholder1, local_placeholder1, USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER1, 0); 45 | // barrier 46 | wait_group_events(2, evt); 47 | } -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/hcl_gemm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test heterocl 3 | """ 4 | import heterocl as hcl 5 | import numpy as np 6 | import time 7 | 8 | def gemm(m=1024, n=1024, k=1024, dtype=hcl.Int(), target=None): 9 | matrix_1 = hcl.placeholder((m, k), dtype=dtype) 10 | matrix_2 = hcl.placeholder((k, n), dtype=dtype) 11 | 12 | def kernel(matrix_1, matrix_2): 13 | r = hcl.reduce_axis(0, k, 'k') 14 | 15 | mat1_buf = hcl.compute((m, k), 16 | lambda x, y: matrix_1[x, y], 17 | dtype=dtype, 18 | name="mat1_buf") 19 | 20 | mat2_buf = hcl.compute((k, n), 21 | lambda x, y: matrix_2[x, y], 22 | dtype=dtype, 23 | name="mat2_buf") 24 | 25 | return hcl.compute((m, n), 26 | lambda x, y: hcl.sum(mat1_buf[x, r] * mat2_buf[r, y], 27 | axis=r, dtype=dtype), 28 | dtype=dtype, 29 | name="out_matrix") 30 | 31 | s = hcl.create_schedule([matrix_1, matrix_2], kernel) 32 | out_matrix = kernel.out_matrix 33 | mat1_buf = kernel.mat1_buf 34 | mat2_buf = kernel.mat2_buf 35 | 36 | m_block = 4 37 | n_block = 8 38 | k_block = 16 39 | 40 | m , k = s[mat1_buf].op.axis 41 | #print(m , k) 42 | x0, x1 = s[mat1_buf].split(m, factor=m_block) 43 | z0, z1 = s[mat1_buf].split(k, factor=k_block) 44 | s[mat1_buf].reorder(x0, z0, x1, z1) 45 | 46 | k , n = s[mat2_buf].op.axis 47 | z0, z1 = s[mat2_buf].split(k, factor=k_block) 48 | y0, y1 = s[mat2_buf].split(n, factor=n_block) 49 | s[mat2_buf].reorder(y0, z0, y1, z1) 50 | 51 | 52 | m, n, k = s[out_matrix].op.axis 53 | #print(m, n, k) 54 | #s[out_matrix].reorder(n, m, k) 55 | 56 | x0, x1 = s[out_matrix].split(m, factor=m_block) 57 | y0, y1 = s[out_matrix].split(n, factor=n_block) 58 | z0, z1 = s[out_matrix].split(k, factor=k_block) 59 | 60 | s[out_matrix].reorder( x0, y0, z0, x1, y1, z1) 61 | 62 | s[mat1_buf].compute_at(s[out_matrix], s[out_matrix].op.axis[0]) 63 | 64 | 65 | #s[mat1_buf].compute_at(s[out_matrix], z0) 66 | 67 | #s[mat1_buf].compute_at(s[out_matrix], z1) 68 | 69 | #s[out_matrix].pipeline(x1) 70 | 71 | f = hcl.build(s, target=target) 72 | print(type(f)) 73 | print(f) 74 | #code = hcl.lower(s) 75 | #print(code) 76 | return f 77 | 78 | def time_gemm(dtype, m=1024, n=1024, k=1024, target=None): 79 | hcl.init(dtype) 80 | f = gemm(m, n, k, dtype, target) 81 | np_1 = np.random.randint(10, size=(m, k)) 82 | np_2 = np.random.randint(10, size=(k, n)) 83 | np_3 = np.matmul(np_1, np_2) 84 | 85 | hcl_m1 = hcl.asarray(np_1, dtype=dtype) 86 | hcl_m2 = 
hcl.asarray(np_2, dtype=dtype) 87 | hcl_m3 = hcl.asarray(np.zeros((m, n)), dtype=dtype) 88 | f(hcl_m1, hcl_m2, hcl_m3) 89 | begin = time.time() 90 | for i in range(10): 91 | f(hcl_m1, hcl_m2, hcl_m3) 92 | end = time.time() 93 | print("dtype is: ", dtype) 94 | print("average of 10 runs takes: {} sec".format((end - begin) / 10)) 95 | np.testing.assert_allclose(hcl_m3.asnumpy(), np_3, rtol=1e-03) 96 | 97 | ############################################################################### 98 | # Test the algorithm with different data types 99 | #dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)] 100 | dtypes = [hcl.Float()] 101 | for dtype in dtypes: 102 | time_gemm(dtype, m=256, n=512, k=1024, target="vhls") 103 | -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/schedule_conv2d_nchw_x86.py: -------------------------------------------------------------------------------- 1 | """ 2 | High performance schedule for conv2d_nchw 3 | Target X86 CPU 4 | 5 | ==================================== 6 | **Author**: `Size Zheng` 7 | """ 8 | import tvm 9 | from flextensor.measure import _evaluate 10 | from flextensor.nn import * 11 | 12 | 13 | def schedule_yolo_conv_x86(s, outputs, inputs, weight): 14 | # inline the padding operation 15 | padded = outputs.op.input_tensors[0] 16 | 17 | # create cache 18 | write_cache = s.cache_write(outputs, "local") 19 | 20 | # tunable parameters 21 | b_factors = [1, 1, 1] 22 | k_factors = [8, 2, 32] 23 | p_factors = [14, 2, 2] 24 | q_factors = [2, 1, 28] 25 | rc_factors = [32, 4, 2] # outer-->inner 26 | ry_factors = [1, 3, 1] 27 | rx_factors = [3, 1, 1] 28 | 29 | # split the spatial axes 30 | b, k, p, q = s[outputs].op.axis 31 | bo, bi = s[outputs].split(b, nparts=b_factors[0]) 32 | ko, ki = s[outputs].split(k, nparts=k_factors[0]) 33 | po, pi = s[outputs].split(p, nparts=p_factors[0]) 34 | qo, qi = s[outputs].split(q, nparts=q_factors[0]) 35 | 36 | vbo, bi = s[outputs].split(bi, nparts=b_factors[1]) 37 | vko, ki = s[outputs].split(ki, nparts=k_factors[1]) 38 | vpo, pi = s[outputs].split(pi, nparts=p_factors[1]) 39 | vqo, qi = s[outputs].split(qi, nparts=q_factors[1]) 40 | 41 | # reorder 42 | s[outputs].reorder(bo, ko, po, qo, vbo, vko, vpo, vqo, bi, ki, pi, qi) 43 | 44 | # fuse 45 | outer = s[outputs].fuse(bo, ko, po, qo) 46 | middle = s[outputs].fuse(vbo, vko, vpo, vqo) 47 | # inner = s[outputs].fuse(bi, ki, pi, qi) 48 | 49 | # vectorize 50 | # s[outputs].vectorize(inner) 51 | 52 | # parallel 53 | s[outputs].parallel(outer) 54 | 55 | # compute at write cache 56 | s[write_cache].compute_at(s[outputs], middle) 57 | 58 | # split reduce axes 59 | wb, wk, wp, wq = s[write_cache].op.axis 60 | # print(s[write_cache].op.reduce_axis) 61 | rc, ry, rx = s[write_cache].op.reduce_axis 62 | rco, rci = s[write_cache].split(rc, nparts=rc_factors[0]) 63 | rcm, rci = s[write_cache].split(rci, nparts=rc_factors[1]) 64 | rxo, rxi = s[write_cache].split(rx, nparts=rx_factors[0]) 65 | rxm, rxi = s[write_cache].split(rxi, nparts=rx_factors[1]) 66 | ryo, ryi = s[write_cache].split(ry, nparts=ry_factors[0]) 67 | rym, ryi = s[write_cache].split(ryi, nparts=ry_factors[1]) 68 | 69 | # reorder 70 | s[write_cache].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, wb, wk, wp, wq) 71 | 72 | 73 | s[outputs].pragma(outer, 'auto_unroll_max_step', 1500) 74 | s[write_cache].vectorize(s[write_cache].op.axis[-1]) 75 | 76 | s[padded].compute_inline() 77 | 78 | 79 | def try_yolo_conv(batch_size=1): 80 | # get the compute 81 | yolo_conv = 
YoloConvLayer6() 82 | input_shape = yolo_conv.get_intput_shape() 83 | inputs = tvm.te.placeholder((batch_size, *input_shape), dtype="float32") 84 | weight = yolo_conv.get_weight() 85 | outputs = yolo_conv(inputs) 86 | bias = yolo_conv.get_bias() 87 | 88 | s = tvm.te.create_schedule(outputs.op) 89 | schedule_yolo_conv_x86(s, outputs, inputs, weight) 90 | 91 | if bias is None: 92 | arg_bufs = [inputs, weight, outputs] 93 | else: 94 | arg_bufs = [inputs, weight, bias, outputs] 95 | stmt = tvm.lower(s, arg_bufs, simple_mode=True) 96 | print(stmt) 97 | dev_id = 1 98 | time_cost = _evaluate(s, arg_bufs, "llvm", dev_id, 100) 99 | print("Yolo conv6 use", time_cost, "ms") 100 | 101 | 102 | if __name__ == "__main__": 103 | try_yolo_conv(batch_size=1) -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/schedule_gemm_vhls.py: -------------------------------------------------------------------------------- 1 | import heterocl as hcl 2 | 3 | 4 | def kernel_gemm(A, B): 5 | k = hcl.reduce_axis(0, A.shape[1], "k") 6 | return hcl.compute( 7 | (A.shape[0], B.shape[1]), 8 | lambda i, j: hcl.sum(A[i, k] * B[k, j], axis=k), 9 | "C") 10 | 11 | 12 | def main(): 13 | M = 512 14 | N = 512 15 | K = 512 16 | A = hcl.placeholder((M, K), dtype="float32", name="A") 17 | B = hcl.placeholder((K, N), dtype="float32", name="B") 18 | 19 | s = hcl.create_schedule([A, B], kernel_gemm) 20 | # split 21 | C = kernel_gemm.C 22 | m, n, k = s[C].op.axis 23 | mo, mi = s[C].split(m, factor=16) 24 | no, ni = s[C].split(n, factor=32) 25 | ko, ki = s[C].split(k, factor=8) 26 | 27 | 28 | # reorder shuffle 29 | s[C].reorder(mo, no, mi, ni, ko, ki) 30 | 31 | 32 | # reorder local 33 | s[C].reorder(mi, ko, ki, ni) 34 | 35 | 36 | # reshape 37 | s.reshape(C, [512//16, 16, 512//32, 32]) 38 | 39 | 40 | # partition 41 | s.partition(A, dim=3) 42 | 43 | 44 | # pipeline 45 | s[C].pipeline(mi) 46 | 47 | 48 | # reuse_at 49 | # nothing to do 50 | 51 | print(hcl.build(s, target="vhls")) 52 | 53 | 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | """ 59 | // result: 60 | #include 61 | #include 62 | #include 63 | 64 | void default_function(float A[512][512], float B[512][512], ap_int<32> C[32][16][16][32]) { 65 | #pragma HLS array_partition variable=A complete dim=3 66 | for (ap_int<32> i_outer = 0; i_outer < 32; ++i_outer) { 67 | for (ap_int<32> j_outer = 0; j_outer < 16; ++j_outer) { 68 | for (ap_int<32> i_inner = 0; i_inner < 16; ++i_inner) { 69 | #pragma HLS pipeline 70 | for (ap_int<32> k_outer = 0; k_outer < 64; ++k_outer) { 71 | ap_int<32> sum; 72 | sum = 0; 73 | for (ap_int<32> k_inner = 0; k_inner < 8; ++k_inner) { 74 | for (ap_int<32> j_inner = 0; j_inner < 32; ++j_inner) { 75 | sum = ((ap_int<32>)((A[(i_inner + (i_outer * 16))][(k_inner + (k_outer * 8))] * B[(k_inner + (k_outer * 8))][(j_inner + (j_outer * 32))]) + ((float)sum))); 76 | } 77 | } 78 | C[i_outer][i_inner][j_outer][j_inner] = sum; 79 | } 80 | } 81 | } 82 | } 83 | 84 | """ -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/schedule_shfit_x86.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import math 3 | import torch 4 | import numpy as np 5 | from flextensor.nn import ShiftConv2d_nhwc 6 | 7 | shift_conv2d_shape = [ 8 | # ShiftNet(https://arxiv.org/abs/1801.09392) with input size: 256*256 9 | (1, 128, 128, 64, 3, 1), 10 | (1, 128, 128, 64, 3, 1), 11 | (1, 64, 64, 128, 5, 1), 12 | (1, 32, 
32, 256, 3, 1), 13 | (1, 16, 16, 512, 3, 1) 14 | ] 15 | 16 | DEV_ID = 0 17 | 18 | 19 | def schedule_shift_1_x86(s, Img, KernelIndex, Output): 20 | return 21 | 22 | 23 | def evaluate(shape, schedule_func): 24 | N, H, W, C, k, dilation = shape 25 | stride = 1 26 | Img = tvm.te.placeholder([N, H, W, C], dtype="float32") 27 | KernelIndex = tvm.te.placeholder([C], dtype="int32") 28 | Output = ShiftConv2d_nhwc(Img, KernelIndex, k, dilation, stride) 29 | 30 | s = tvm.te.create_schedule(Output.op) 31 | schedule_func(s, Img, KernelIndex, Output) 32 | 33 | func = tvm.build(s, [Img, KernelIndex, Output], "llvm") 34 | Img_torch = torch.rand([N, H, W, C], dtype=torch.float32) 35 | Kernel_torch = torch.rand([C, k, k], dtype=torch.float32) 36 | KernelIndex_torch = torch.argmax(Kernel_torch.reshape([C, -1]), dim=1) 37 | 38 | paddings = [math.ceil(((stride - 1) * H - stride + dilation * (k - 1)) / 2), 39 | math.ceil(((stride - 1) * W - stride + dilation * (k - 1)) / 2)] 40 | image_height = H 41 | image_width = W 42 | out_height = math.floor((image_height + 2 * paddings[0]- dilation * (k - 1) - 1) / stride + 1) 43 | out_width = math.floor((image_width + 2 * paddings[1] - dilation * (k - 1) - 1) / stride + 1) 44 | output_shape = (N, out_height, out_width, C) 45 | 46 | Output_torch = torch.zeros(output_shape, dtype=torch.float32) 47 | 48 | ctx = tvm.device("llvm", DEV_ID) 49 | 50 | Img_tvm = tvm.nd.array(Img_torch.numpy().astype(np.float32), ctx) 51 | KernelIndex_tvm = tvm.nd.array(KernelIndex_torch.numpy().astype(np.int32), ctx) 52 | Output_tvm = tvm.nd.array(Output_torch.numpy().astype(np.float32), ctx) 53 | 54 | 55 | evaluator = func.time_evaluator(func.entry_name, ctx, number=10) 56 | time_cost = evaluator(Img_tvm, KernelIndex_tvm, Output_tvm).mean * 1e3 57 | 58 | return time_cost 59 | 60 | 61 | 62 | def main(): 63 | print(evaluate(shift_conv2d_shape[0], schedule_shift_1_x86)) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/simple-gemm.cl: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | extern "C" void default_function_kernel0( float* C, float* A, float* B) { 6 | #pragma HLS INTERFACE m_axi port=C offset=slave bundle=gmem 7 | #pragma HLS INTERFACE s_axilite port=C bundle=control 8 | #pragma HLS INTERFACE m_axi port=A offset=slave bundle=gmem 9 | #pragma HLS INTERFACE s_axilite port=A bundle=control 10 | #pragma HLS INTERFACE m_axi port=B offset=slave bundle=gmem 11 | #pragma HLS INTERFACE s_axilite port=B bundle=control 12 | #pragma HLS INTERFACE s_axilite port=return bundle=control 13 | 14 | for (int i_inner_outer = 0; i_inner_outer < 32; ++i_inner_outer) { 15 | for (int j_outer = 0; j_outer < 32; ++j_outer) { 16 | for (int i_inner_inner_init = 0; i_inner_inner_init < 32; ++i_inner_inner_init) { 17 | for (int j_inner_init = 0; j_inner_init < 16; ++j_inner_init) { 18 | C[((((i_inner_outer * 16384) + (i_inner_inner_init * 512)) + (j_outer * 16)) + j_inner_init)] = 0.000000e+00f; 19 | } 20 | } 21 | for (int k_outer = 0; k_outer < 32; ++k_outer) { 22 | for (int i_inner_inner = 0; i_inner_inner < 32; ++i_inner_inner) { 23 | for (int k_inner = 0; k_inner < 8; ++k_inner) { 24 | for (int j_inner = 0; j_inner < 16; ++j_inner) { 25 | C[((((i_inner_outer * 16384) + (i_inner_inner * 512)) + (j_outer * 16)) + j_inner)] = (C[((((i_inner_outer * 16384) + (i_inner_inner * 512)) + (j_outer * 16)) + j_inner)] + 
(A[((((i_inner_outer * 8192) + (i_inner_inner * 256)) + (k_outer * 8)) + k_inner)] * B[((((k_outer * 4096) + (k_inner * 512)) + (j_outer * 16)) + j_inner)])); 26 | } 27 | } 28 | } 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/tune_conv2d_NCHWc.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import time 4 | import json 5 | import tvm 6 | from flextensor.task import register_task, Task 7 | from flextensor.measure import _evaluate 8 | from flextensor.nn import conv2d_nchwc 9 | from flextensor.configs.conv2d_config import yolo_shapes_b1 10 | from flextensor.scheduler import schedule, schedule_with_config 11 | from flextensor.utils import RpcInfo 12 | 13 | 14 | def conv2d_nchwc_compute_avx2(N, C, H, W, K, k=3, use_bias=False, st=1, pad=0, dilation=1, group=1, vlen1=8, vlen2=8): 15 | inputs = tvm.te.placeholder([N, C // vlen1 // group, H, W, vlen1], dtype="float32") 16 | weight = tvm.te.placeholder([K // vlen2, C // vlen1 // group, k, k, vlen1, vlen2], dtype="float32") 17 | if use_bias: 18 | bias = tvm.te.placeholder([K // vlen2, vlen2], dtype="float32") 19 | else: 20 | bias = None 21 | output = conv2d_nchwc(inputs, weight, bias, stride=st, padding=pad, dilation=dilation, groups=group) 22 | if use_bias: 23 | return output, [inputs, weight, bias, output] 24 | else: 25 | return [output.op], [inputs, weight, output] 26 | 27 | 28 | if __name__ == "__main__": 29 | N, C, H, W, K, _, k, _, _, st, pad, dilation, group = yolo_shapes_b1[5] 30 | 31 | use_bias = False 32 | vlen = 8 33 | target = "llvm" 34 | dev_id = 0 35 | trials = 100 36 | timeout = 10 37 | parallel = 20 38 | method = "searching" 39 | force_inline = True 40 | use_model = False 41 | logfile = open("tmp.log", "w") 42 | rpc_info = RpcInfo("0.0.0.0", 9090, target_host="llvm") 43 | 44 | args = (N, C, H, W, K, k, use_bias, st, pad, dilation, group) 45 | task = Task("conv2d_nchwc", "yolo_conv6", conv2d_nchwc_compute_avx2, args, target, dev_id=dev_id) 46 | register_task(task, override=False) 47 | 48 | beg = time.time() 49 | s, bufs, configs = schedule( 50 | task.key, 51 | op_trial=trials, 52 | timeout=timeout, 53 | op_stop=30, 54 | parallel=parallel, 55 | method=method, 56 | use_model=use_model, 57 | trials=[trials//10, trials], 58 | force_inline=force_inline, 59 | rpc_info=rpc_info, 60 | slevel=2, 61 | rlevel=2 62 | ) 63 | end = time.time() 64 | 65 | print("######################################") 66 | print("op schedules:") 67 | for config in configs.op_config_lst: 68 | print("----------------------------------") 69 | for name, value in config.items(): 70 | if value: 71 | print(name, value) 72 | print("graph schedules:") 73 | for name, value in configs.graph_config.items(): 74 | if value: 75 | print(name, value) 76 | string = json.dumps(configs) 77 | line = task.key + ":" + string 78 | print(line, file=logfile, flush=True) 79 | s, bufs = schedule_with_config(task.key, configs) 80 | time_cost = _evaluate(s, bufs, target, dev_id, 10) 81 | print("Use", time_cost, "ms", "throughput: %f GFLOPS" % (N * C * H * W * K * k * k / st / st / group / 1e6 / time_cost)) 82 | print("Cost", end - beg, "s") 83 | 84 | logfile.close() 85 | -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/tvm_pragma.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 
import numpy as np 3 | import os 4 | 5 | 6 | M = 1024 7 | N = 512 8 | K = 256 9 | 10 | 11 | def test1(): 12 | A = tvm.te.placeholder([M, K], name="A") 13 | B = tvm.te.placeholder([K, N], name="B") 14 | k = tvm.te.reduce_axis((0, K), name="k") 15 | # A1 = tvm.te.compute([M, K], lambda i, j: A[i, j], "A1") 16 | # B1 = tvm.te.compute([K, N], lambda i, j: B[i, j], "B1") 17 | C = tvm.te.compute([M, N], lambda i, j: tvm.te.sum(A[i, k] * B[k, j], axis=[k]), "C") 18 | 19 | s = tvm.te.create_schedule(C.op) 20 | 21 | A1 = s.cache_read(A, "local", [C]) 22 | B1 = s.cache_read(B, "local", [C]) 23 | 24 | m, n = s[C].op.axis 25 | om, im = s[C].split(m, nparts=1) 26 | s[C].bind(om, tvm.te.thread_axis("blockIdx.x")) 27 | mo, mi = s[C].split(im, factor=32) 28 | no, ni = s[C].split(n, factor=16) 29 | k = s[C].op.reduce_axis[0] 30 | ko, ki = s[C].split(k, factor=8) 31 | 32 | s[C].reorder(mo, no, ko, mi, ki, ni) 33 | # s[C].bind(no, tvm.te.thread_axis("threadIdx.x")) 34 | 35 | s[A1].compute_at(s[C], ko) 36 | s[B1].compute_at(s[C], ko) 37 | 38 | # print(tvm.lower(s, [A, B, C])) 39 | 40 | f = tvm.build(s, [A, B, C], target="opencl") 41 | print(dir(f)) 42 | print(f.get_source()) 43 | print(f.imported_modules[0].get_source()) 44 | 45 | 46 | def test2(): 47 | tgt_host="llvm" 48 | tgt="aocl_sw_emu" 49 | n = tvm.te.var("n") 50 | A = tvm.te.placeholder((n,), name='A') 51 | B = tvm.te.placeholder((n,), name='B') 52 | C = tvm.te.compute(A.shape, lambda i: A[i] + B[i], name="C") 53 | 54 | s = tvm.te.create_schedule(C.op) 55 | px, x = s[C].split(C.op.axis[0], nparts=1) 56 | 57 | s[C].bind(px, tvm.te.thread_axis("pipeline")) 58 | 59 | fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd") 60 | 61 | fadd.save("myadd.o") 62 | fadd.imported_modules[0].save("myadd.aocx") 63 | 64 | tvm.contrib.cc.create_shared("myadd.so", ["myadd.o"]) 65 | 66 | 67 | def run_aocl(): 68 | tgt="aocl_sw_emu" 69 | 70 | fadd = tvm.runtime.module.load_module("myadd.so") 71 | fadd_dev = tvm.runtime.module.load_module("myadd.aocx") 72 | fadd.import_module(fadd_dev) 73 | 74 | ctx = tvm.device(tgt, 0) 75 | 76 | n = 1024 77 | a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx) 78 | b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx) 79 | c = tvm.nd.array(np.zeros(n, dtype="float32"), ctx) 80 | 81 | fadd(a, b, c) 82 | tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) 83 | 84 | 85 | def test3(): 86 | A = tvm.te.placeholder([M, K], name="A") 87 | B = tvm.te.placeholder([K, N], name="B") 88 | k = tvm.te.reduce_axis((0, K), name="k") 89 | C = tvm.te.compute([M, N], lambda i, j: tvm.te.sum(A[i, k] * B[k, j], axis=[k]), "C") 90 | 91 | s = tvm.te.create_schedule(C.op) 92 | 93 | m, n = s[C].op.axis 94 | om, im = s[C].split(m, nparts=1) 95 | s[C].bind(om, tvm.te.thread_axis("pipeline")) 96 | mo, mi = s[C].split(im, factor=32) 97 | no, ni = s[C].split(n, factor=16) 98 | k = s[C].op.reduce_axis[0] 99 | ko, ki = s[C].split(k, factor=8) 100 | 101 | s[C].reorder(mo, no, ko, mi, ki, ni) 102 | 103 | print(tvm.lower(s, [A, B, C])) 104 | 105 | f = tvm.build(s, [A, B, C], target="sdaccel") 106 | print(dir(f)) 107 | print(f.get_source()) 108 | print(f.imported_modules[0].get_source()) 109 | 110 | 111 | def main(): 112 | test1() 113 | # test3() 114 | 115 | 116 | if __name__ == "__main__": 117 | main() -------------------------------------------------------------------------------- /flextensor/testing/others/profile/Makefile: -------------------------------------------------------------------------------- 1 | CXX 
:= nvcc 2 | TARGET := profile_autate 3 | CUDNN_PATH := /usr/local/cuda-10.1 4 | HEADERS := -I $(CUDNN_PATH)/include 5 | LIBS := -L $(CUDNN_PATH)/lib64 -L/usr/local/lib 6 | LIBDEVICE := --dont-use-profile -ldir $(CUDNN_PATH)/nvvm/libdevice 7 | CXXFLAGS := -arch=sm_60 -std=c++11 -O2 8 | 9 | all: conv 10 | 11 | conv: $(TARGET).cu 12 | $(CXX) $(CXXFLAGS) $(LIBDEVICE) $(HEADERS) $(LIBS) $(TARGET).cu -o $(TARGET) 13 | 14 | .PHONY: clean 15 | 16 | clean: 17 | rm $(TARGET) || echo -n "" -------------------------------------------------------------------------------- /flextensor/testing/others/profile/compute_flops.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from flextensor.configs.conv2d_config import yolo_shapes 3 | 4 | 5 | def gflops(batch, in_channel, out_channel, H, W, k_h, k_w, stride, padding, dilation): 6 | out_h = (H + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1 7 | out_w = (W + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1 8 | return 2 * batch * out_h * out_w * in_channel * out_channel * k_h * k_w / 1e9 9 | 10 | 11 | def perf(gflops, millis): 12 | return gflops / (millis / 1e3) 13 | 14 | 15 | if __name__ == "__main__": 16 | # flextensor 17 | millis = [ 18 | 0.1006952, 19 | 0.2825153, 20 | 0.0252457, 21 | 0.2062096, 22 | 0.0571187, 23 | 0.7426347, 24 | 0.0372696, 25 | 0.210653, 26 | 0.0540586, 27 | 0.7972785, 28 | 0.0652985, 29 | 0.2498188, 30 | 0.5609756, 31 | 0.3801411, 32 | 0.2718407 33 | ] 34 | # autotvm 35 | # millis = [ 36 | # 0.145611, 37 | # 0.385738, 38 | # 0.038619, 39 | # 0.311103, 40 | # 0.080117, 41 | # 0.897629, 42 | # 0.059699, 43 | # 0.287437, 44 | # 0.090796, 45 | # 0.903871, 46 | # 0.069489, 47 | # 0.399444, 48 | # 0.668653, 49 | # 0.588122, 50 | # 0.555237 51 | # ] 52 | i = 0 53 | for shape in yolo_shapes: 54 | batch, in_c, H, W, out_c, _, k_h, k_w, _, stride, padding, dilation, groups = shape 55 | print(perf(gflops(batch, in_c, out_c, H, W, k_h, k_w, stride, padding, dilation), millis[i])) 56 | i += 1 57 | -------------------------------------------------------------------------------- /flextensor/testing/others/profile/profile_flextensor_yolo_b1_conv3.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/testing/others/profile/profile_flextensor_yolo_b1_conv3.cu --------------------------------------------------------------------------------
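For reference, a worked instance of the gflops()/perf() arithmetic above. The layer shape is taken from the first ConvBlock in pytorch-yolo-v1.py (3->64 channels, 7x7 kernel, stride 2, padding 3, 448x448 input); batch=1 and the pairing with the first millis entry are assumptions for illustration, since yolo_shapes itself is defined elsewhere in the repository:

out_h = (448 + 2 * 3 - 1 * (7 - 1) - 1) // 2 + 1       # = 224
flop_g = 2 * 1 * out_h * out_h * 3 * 64 * 7 * 7 / 1e9  # ~0.944 GFLOP
print(flop_g / (0.1006952 / 1e3))                      # ~9376 GFLOPS, under
                                                       # the pairing assumed above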