├── .gitignore ├── LICENSE ├── README.md └── flextensor ├── __init__.py ├── baselines ├── bilinear_baseline.py ├── block_matrix_circulant_baseline.cu ├── block_matrix_circulant_baseline.py ├── conv-autotvm │ ├── autotvm_opt_topi_conv2d_cpu.py │ ├── tune_conv2d_cuda.py │ ├── tune_conv2d_nchw_cuda.py │ ├── tune_depthwise_cuda.py │ ├── tune_nnvm_cuda.py │ └── tune_relay_cuda.py ├── conv-cudnn │ ├── Makefile │ └── cudnn_conv.cu ├── conv-mkl │ └── simple_conv.cpp ├── conv-tvm │ └── opt_conv_cuda.py ├── conv1d_baseline.py ├── conv2d_baseline.py ├── conv3d_baseline.py ├── conv_transpose1d_baseline.py ├── conv_transpose2d_baseline.py ├── conv_transpose3d_baseline.py ├── depthwise_baseline.py ├── dilation_baseline.py ├── flextensor │ ├── conv1d-p100.txt │ ├── conv1d-titanxp.txt │ ├── conv1d-v100.txt │ ├── conv3d-p100.txt │ ├── conv3d-titanxp.txt │ ├── conv3d-v100.txt │ ├── dilation-v100.txt │ ├── group_conv2d-v100.txt │ ├── yolo-conv2d-b128-v100.txt │ ├── yolo-conv2d-b32-v100.txt │ ├── yolo-conv2d-p100.txt │ ├── yolo-conv2d-titanxp.txt │ └── yolo-conv2d-v100.txt ├── gatedPixelCNN_baseline.py ├── gemm-cublas │ ├── Makefile │ ├── Makefile.old │ ├── cublas_batch_gemm.cu │ ├── cublas_gemm.cu │ ├── helper_cuda.h │ └── helper_string.h ├── gemm_baseline.py ├── gemv_baseline.py ├── grouped_baseline.py ├── pixelCNN_baseline.py ├── shift_conv2d_baseline.py ├── sparse │ └── sparse-gemm.py ├── taco │ └── taco-gemm.c ├── unpooling1d_baseline.py └── unpooling2d_baseline.py ├── configs ├── PixelCNN_config.py ├── bilinear_config.py ├── block_circulant_matrix_config.py ├── conv1d_config.py ├── conv2d_config.py ├── conv3d_config.py ├── conv_transpose2d_config.py ├── depthwise_config.py ├── dilation_config.py ├── gated_pixelcnn_config.py ├── gemm_config.py ├── gemv_config.py ├── grouped_config.py ├── maxunpooling1d_config.py ├── maxunpooling2d_config.py ├── mttkrp_config.py └── shift_conv2d_config.py ├── examples ├── __init__.py ├── autotvm_opt_conv1_cpu.py ├── autotvm_opt_topi_conv2d_cpu.py ├── autotvm_opt_topi_conv2d_gpu.py ├── autotvm_opt_topi_matmul_cpu.py ├── opt_blur2d_cpu.py ├── opt_conv3d_cpu.py ├── opt_conv_cpu.py ├── opt_conv_gpu.py ├── opt_gemm_cpu.py ├── opt_gemm_gpu.py ├── opt_mttkrp3_cpu.py ├── opt_outer_cpu.py ├── run_experiments.py ├── single_operation.py ├── transfer_cpu.py └── transfer_gpu.py ├── measure.py ├── model.py ├── nn ├── README.md ├── __init__.py ├── layers.py └── ops.py ├── optimize ├── README.md ├── common.py ├── conv1d-config.log ├── conv2d-config.log ├── depthwise_conv2d-config.log ├── gemm-config.log ├── gemm-config.old.log ├── gemm-config.v0.log ├── gemm-config.v1.log ├── optimize_bilinear.py ├── optimize_block_circulant_matrix.py ├── optimize_conv1d.py ├── optimize_conv2d.py ├── optimize_conv2d_1x1_packed.py ├── optimize_conv3d.py ├── optimize_conv_transpose1d.py ├── optimize_conv_transpose2d.py ├── optimize_conv_transpose3d.py ├── optimize_depthwise_conv2d.py ├── optimize_dilation_conv2d.py ├── optimize_gatedPixelCNN.py ├── optimize_gemm.py ├── optimize_gemm_conv2d.py ├── optimize_gemv.py ├── optimize_grouped_conv2d.py ├── optimize_mttkrp.py ├── optimize_pixelCNN.py ├── optimize_shift_conv2d.py ├── optimize_test_conv.py ├── optimize_unpooling1d.py ├── optimize_unpooling2d.py ├── run_remote_opencl_conv1d.sh ├── run_remote_opencl_conv2d.sh ├── run_remote_opencl_depthwise_conv2d.sh └── run_remote_opencl_gemm.sh ├── project └── tensor_graph │ ├── README.md │ ├── build.py │ ├── conv2d_model │ ├── conv2d_model.pkl │ └── log_conv2d_train.txt │ ├── dataset │ ├── all.txt │ ├── all_test.txt 
│ ├── all_train.txt │ ├── conv2d.txt │ ├── conv2d_test.txt │ ├── conv2d_train.txt │ ├── gemm.txt │ ├── gemm_test.txt │ ├── gemm_train.txt │ └── preprocess.py │ ├── example.py │ ├── expr_visitor.py │ ├── gemm_model │ ├── gemm_model.pkl │ └── log_gemm_train.txt │ ├── graph.py │ ├── model.py │ ├── node.py │ ├── ops.py │ ├── preprocess.py │ ├── softmax_issue.py │ ├── space.py │ ├── train.py │ └── utils.py ├── scheduler.py ├── space.py ├── task.py ├── templates ├── __init__.py ├── cpu.py ├── cuda.py ├── opencl.py └── utils.py ├── test ├── __init__.py ├── check_grouped_results.py ├── naive_schedule_all.py ├── pyimpl.py ├── test_halide │ ├── network.cpp │ └── tutorial.md ├── test_ops.py ├── test_scheduler.py ├── test_tvm │ ├── grad │ │ ├── dqn_pytorch.py │ │ ├── layers.py │ │ ├── relay-dqn.py │ │ └── relay-lenet.py │ ├── graph │ │ ├── placeholder-only.py │ │ └── share-placeholder.py │ └── legacy │ │ ├── android_gemm_square.py │ │ ├── conv.py │ │ ├── cross_compilation_and_rpc.py │ │ ├── cuda_gemm_square.py │ │ ├── depthwise_conv2d_test.py │ │ ├── gemm_int8.py │ │ ├── lstm.py │ │ ├── matexp.py │ │ ├── multi_compute_inline.py │ │ ├── opt_conv_cpu.py │ │ ├── opt_conv_cuda.py │ │ ├── opt_gemm.py │ │ ├── test_broadcast_map.py │ │ ├── test_compute_inline.py │ │ ├── test_conv2d_hwcn_map.py │ │ ├── test_conv_int8_intel.py │ │ ├── test_multi_outputs.py │ │ ├── test_one_thread.py │ │ ├── test_reduce_map.py │ │ ├── tune_relay_x86.py │ │ └── variant_scale.py └── test_tvm_expr │ ├── grad │ ├── requires_grad.py │ ├── te-avgpool-case1.py │ ├── te-broadcast-case1.py │ ├── te-cat-case1.py │ ├── te-conv2d-case1.py │ ├── te-conv2d-case2.py │ ├── te-conv2d-case3.py │ ├── te-conv2d-topi-case1.py │ ├── te-cross_entropy-case1.py │ ├── te-downcast-case1.py │ ├── te-flatten.py │ ├── te-gemm.py │ ├── te-maxpool-case1.py │ ├── te-mse_loss-case1.py │ ├── te-padding-case1.py │ ├── te-power-case1.py │ ├── te-repeat-case1.py │ ├── te-softmax-case1.py │ ├── te-sub-case1.py │ ├── te-sub-case2.py │ ├── test_report.md │ ├── tir-relu-case1.py │ └── tir-tanh-case1.py │ ├── network │ └── lenet.py │ └── train │ ├── get_lm_data.sh │ ├── lenet-CEloss-new-api.py │ ├── lenet-CEloss.py │ ├── lenet.py │ ├── lltm.py │ ├── mi_lstm_pytorch.py │ ├── mlp.py │ ├── scrnn_pytorch.py │ └── train-language-modeling.py ├── testing ├── __init__.py ├── array_mul.py ├── get_feature.py ├── net │ ├── nnvm-mobilenet-v1.py │ ├── nnvm-mobilenet-v2.py │ ├── overfeat.py │ ├── pytorch-overfeat.py │ ├── pytorch-yolo-v1.py │ └── yolo_v1.py ├── others │ ├── assemble.py │ ├── compare_conv_cpu.py │ ├── hand-craft │ │ ├── complex-gemm.cl │ │ ├── config_yolo1_cpu.py │ │ ├── config_yolo1_cuda.py │ │ ├── config_yolo24_cpu.py │ │ ├── config_yolo24_cuda.py │ │ ├── conv_example.cl │ │ ├── gemmini-conv2d-3x3-nhwc-spike.py │ │ ├── gemmini-conv2d-3x3-nhwc-zync.py │ │ ├── gemmini-gemv-spike.py │ │ ├── gemmini-ttm-spike.py │ │ ├── hcl_gemm.py │ │ ├── optimize_conv2d.py │ │ ├── schedule_conv2d_1x1.py │ │ ├── schedule_conv2d_nchw_cuda.py │ │ ├── schedule_conv2d_nchw_x86.py │ │ ├── schedule_conv2d_nchwc_x86.py │ │ ├── schedule_conv2d_vhls.py │ │ ├── schedule_gemm_conv2d_x86.py │ │ ├── schedule_gemm_vhls.py │ │ ├── schedule_shfit_x86.py │ │ ├── schedule_shift_cuda.py │ │ ├── simple-gemm.cl │ │ ├── tune_conv2d_NCHWc.py │ │ └── tvm_pragma.py │ ├── profile │ │ ├── Makefile │ │ ├── compute_flops.py │ │ ├── profile_autate_yolo_b8_conv11.cu │ │ ├── profile_autotvm_yolo_b8_conv11.cu │ │ ├── profile_flextensor_yolo_b1_conv3.cu │ │ ├── run_tune.txt │ │ ├── yolo_conv11_opencl_autate.cl │ │ └── 
yolo_conv11_opencl_autotvm.cl │ ├── test_conv2d_cuda_behavior.py │ ├── test_conv2d_cuda_different_schedule.py │ ├── test_conv2d_llvm_behavior.py │ └── tune_batch_conv2d_cuda.py ├── test_ir_visit.py └── test_ir_visit_print.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | notes 3 | experiment_data 4 | *vscode* 5 | .idea 6 | .DS_Store 7 | *.cl 8 | *.csv 9 | *.zip 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2021 Size Zheng 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /flextensor/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /flextensor/baselines/bilinear_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | import torch 4 | import numpy as np 5 | from flextensor.configs.bilinear_config import bilinear_shapes 6 | torch.backends.cudnn.enabled = False 7 | 8 | 9 | def pytorch_cpu(N, K1, K2, M, number=100, dev=0): 10 | run_time = timeit.timeit(setup= 'import torch\n' 11 | 'A = torch.rand([' + str(N) + ', ' + str(K1) + '], dtype=torch.float32)\n' 12 | 'B = torch.rand([' + str(N) + ', ' + str(K2) + '], dtype=torch.float32)\n' 13 | 'C = torch.rand([' + str(M) + ', ' + str(K1) + ', ' + str(K2) + '], dtype=torch.float32)\n' 14 | 'torch.nn.functional.bilinear(A, B, C)\n', 15 | stmt='ans = torch.nn.functional.bilinear(A, B, C)', 16 | number=number) 17 | return run_time / number * 1e3 18 | 19 | 20 | def pytorch_cuda(N, K1, K2, M, number=100, dev=0): 21 | A = torch.rand([N, K1], dtype=torch.float32).cuda("cuda:" + str(dev)) 22 | B = torch.rand([N, K2], dtype=torch.float32).cuda("cuda:" + str(dev)) 23 | C = torch.rand([M, K1, K2], dtype=torch.float32).cuda("cuda:" + str(dev)) 24 | 25 | # warm-up 26 | torch.nn.functional.bilinear(A, B, C) 27 | torch.cuda.synchronize() 28 | sum_time = 0.0 29 | for i in range(number): 30 | start = torch.cuda.Event(enable_timing=True) 31 | end = torch.cuda.Event(enable_timing=True) 32 | start.record() 33 | ans = torch.nn.functional.bilinear(A, B, C) 34 | end.record() 35 | 36 | # Waits for everything to finish running 37 | torch.cuda.synchronize() 38 | sum_time += start.elapsed_time(end) 39 | 40 | return sum_time / number 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 46 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 47 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 48 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 49 | parser.add_argument("--device", help="target device number", type=int, default=0) 50 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 51 | 52 | args = parser.parse_args() 53 | shapes = bilinear_shapes 54 | if args.to < 0: 55 | end = len(shapes) 56 | else: 57 | end = args.to 58 | shapes = shapes[args.from_:end] 59 | if args.type == "pytorch": 60 | if args.target == "cuda": 61 | baseline = pytorch_cuda 62 | elif args.target == "llvm": 63 | baseline = pytorch_cpu 64 | else: 65 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 66 | else: 67 | raise RuntimeError("Only implement pytorch baseline now, no '%s' baseline"%args.type) 68 | 69 | print("%s baselines bilinear for target %s (%d):" % (args.type, args.target, args.device)) 70 | for i, shape in enumerate(shapes): 71 | count = i + args.from_ 72 | print("layer", count) 73 | N, K1, K2, M = shape 74 | cost = baseline(N, K1, K2, M, args.number, args.device) 75 | print("Use %f(ms)" % cost) 76 | print("Done!") 77 | -------------------------------------------------------------------------------- /flextensor/baselines/block_matrix_circulant_baseline.py:
-------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import copy 4 | import torch 5 | import numpy as np 6 | from flextensor.configs.block_circulant_matrix_config import block_circulant_matrix_shapes as shapes 7 | 8 | def python_cpu(ROW, COL, FFT, number=10, dev=0): 9 | Input = np.random.random([ROW, COL]).astype(np.float32) 10 | Output = np.random.random([ROW, COL]).astype(np.float32) 11 | 12 | def run(): 13 | nonlocal Input, Output 14 | for i in range(ROW // FFT): 15 | sub_vec = np.zeros([FFT], dtype=np.float32) 16 | vec = np.zeros([COL], dtype=np.float32) 17 | for t in range(COL // FFT): 18 | for m in range(FFT): 19 | for n in range(FFT): 20 | vec[t * FFT + m] += \ 21 | Input[FFT * i + n][t * FFT + (m + n) % FFT] / FFT 22 | 23 | for j in range(FFT): 24 | for k in range(COL//FFT): 25 | if j >= 1: 26 | sub_vec[0] = vec[FFT * (k + 1) - 1] 27 | sub_vec[1: FFT] = vec[FFT * k: FFT * (k + 1) - 1] 28 | vec[FFT * k: FFT * (k + 1)] = sub_vec 29 | Output[FFT * i + j][:] = copy.deepcopy(vec) 30 | 31 | sum_time = 0.0 32 | for _ in range(number): 33 | start = time.time() 34 | run() 35 | end = time.time() 36 | sum_time += end - start 37 | 38 | return sum_time / number * 1e3 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 44 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 45 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 46 | parser.add_argument("--target", help="target device type", type=str, default="cpu") 47 | parser.add_argument("--device", help="target device number", type=int, default=0) 48 | parser.add_argument("--type", help="type of baseline", type=str, default="python") 49 | 50 | args = parser.parse_args() 51 | if args.to < 0: 52 | end = len(shapes) 53 | else: 54 | end = args.to 55 | shapes = shapes[args.from_:end] 56 | 57 | if args.type == "python": 58 | if args.target == "cpu": 59 | baseline = python_cpu 60 | else: 61 | raise RuntimeError("Only support target 'cpu', but got %s"%args.target) 62 | else: 63 | raise RuntimeError("Only implement python baseline now, no '%s' baseline"%args.type) 64 | 65 | print("%s baselines block circulant matrix for target %s (%d):" % (args.type, args.target, args.device)) 66 | for i, shape in enumerate(shapes): 67 | count = i + args.from_ 68 | print("layer", count) 69 | cost = baseline(*shape, number=args.number, dev=args.device) 70 | print("Use %f(ms)" % cost) 71 | print("Done!") 72 | -------------------------------------------------------------------------------- /flextensor/baselines/conv-cudnn/Makefile: -------------------------------------------------------------------------------- 1 | CXX := nvcc 2 | TARGET := cudnn_conv 3 | CUDNN_PATH := /usr/local/cuda-10.1 4 | HEADERS := -I $(CUDNN_PATH)/include 5 | LIBS := -L $(CUDNN_PATH)/lib64 -L/usr/local/lib 6 | LIBDEVICE := --dont-use-profile -ldir $(CUDNN_PATH)/nvvm/libdevice 7 | CXXFLAGS := -arch=sm_70 -std=c++11 -O2 8 | 9 | all: conv 10 | 11 | conv: $(TARGET).cu 12 | $(CXX) $(CXXFLAGS) $(LIBDEVICE) $(HEADERS) $(LIBS) $(TARGET).cu -o $(TARGET) \ 13 | -lcudnn 14 | 15 | .PHONY: clean 16 | 17 | clean: 18 | rm $(TARGET) || echo -n "" -------------------------------------------------------------------------------- /flextensor/baselines/conv1d_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import
timeit 3 | import torch 4 | from flextensor.configs.conv1d_config import conv1d_shapes 5 | torch.backends.cudnn.enabled = True 6 | 7 | 8 | def pytorch_cpu(batch_size, length, channel, kernel_size, output_channel, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 9 | run_time = timeit.timeit(setup= 'import torch\n' 10 | 'conv = torch.nn.functional.conv1d\n' 11 | 'A = torch.rand([' + str(batch_size) + ', ' + str(channel) + ', ' + str(length) + '], dtype=torch.float32)\n' 12 | 'W = torch.rand([' + str(output_channel) + ', ' + str(channel//groups) + ', ' + str(kernel_size) + '], dtype=torch.float32)\n' 13 | 'conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')\n', 14 | stmt='ans = conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')', 15 | number=number) 16 | return run_time / number * 1e3 17 | 18 | 19 | def pytorch_cuda(N, L, C, kernel_size, K, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 20 | A = torch.rand([N, C, L], dtype=torch.float32).cuda("cuda:" + str(dev)) 21 | W = torch.rand([K, C//groups, kernel_size], dtype=torch.float32).cuda("cuda:" + str(dev)) 22 | 23 | # warm-up 24 | torch.nn.functional.conv1d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 25 | torch.cuda.synchronize() 26 | sum_time = 0.0 27 | for i in range(number): 28 | start = torch.cuda.Event(enable_timing=True) 29 | end = torch.cuda.Event(enable_timing=True) 30 | start.record() 31 | ans = torch.nn.functional.conv1d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 32 | end.record() 33 | 34 | # Waits for everything to finish running 35 | torch.cuda.synchronize() 36 | sum_time += start.elapsed_time(end) 37 | return sum_time / number 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 43 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 44 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 45 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 46 | parser.add_argument("--device", help="target device number", type=int, default=0) 47 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 48 | 49 | args = parser.parse_args() 50 | shapes = conv1d_shapes 51 | if args.to < 0: 52 | end = len(shapes) 53 | else: 54 | end = args.to 55 | shapes = shapes[args.from_:end] 56 | if args.type == "pytorch": 57 | if args.target == "cuda": 58 | baseline = pytorch_cuda 59 | elif args.target == "llvm": 60 | baseline = pytorch_cpu 61 | else: 62 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 63 | else: 64 | raise RuntimeError("Only implement pytorch baseline now, no '%s' baseline"%args.type) 65 | 66 | print("%s baselines convolution 1d for target %s (%d):" % (args.type, args.target, args.device)) 67 | for i, shape in enumerate(shapes): 68 | count = i + args.from_ 69 | print("layer", count) 70 | batch, in_channel, length, out_channel, _, k_len, _, stride, padding, dilation, groups = shape 71 | cost = baseline(batch, length, in_channel, k_len, out_channel, stride=stride, padding=padding, dilation=dilation, groups=groups, number=args.number, dev=args.device) 72 | print("Use %f(ms)" % cost) 73 | print("Done!") 74 | 
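Note: every `pytorch_cuda` baseline in this directory (bilinear, conv1d, conv3d, dilation, gemm, gemv, grouped) repeats the same CUDA-event timing loop seen above. The helper below is a minimal sketch of that shared pattern, not code from the repository; the names `cuda_time_ms`, `fn`, and `warmup` are hypothetical, introduced here only for illustration.

import torch

def cuda_time_ms(fn, number=100, warmup=1):
    # Warm-up runs, matching the baselines' behavior before timing starts.
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    total = 0.0
    for _ in range(number):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()
        end.record()
        # Synchronize so start.elapsed_time(end) (milliseconds) is valid.
        torch.cuda.synchronize()
        total += start.elapsed_time(end)
    return total / number

With this helper, the body of `pytorch_cuda` in conv1d_baseline.py reduces to `cuda_time_ms(lambda: torch.nn.functional.conv1d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups), number)` once `A` and `W` are on the GPU.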
-------------------------------------------------------------------------------- /flextensor/baselines/conv3d_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | import torch 4 | from flextensor.configs.conv3d_config import conv3d_shapes 5 | torch.backends.cudnn.enabled = True 6 | 7 | 8 | def pytorch_cpu(batch_size, depth, height, width, channel, kernel_size, output_channel, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 9 | run_time = timeit.timeit(setup= 'import torch\n' 10 | 'conv = torch.nn.functional.conv3d\n' 11 | 'A = torch.rand([' + str(batch_size) + ', ' + str(channel) + ', ' + str(depth) + ', ' + str(height) + ', ' + str(width) + '], dtype=torch.float32)\n' 12 | 'W = torch.rand([' + str(output_channel) + ', ' + str(channel//groups) + ', ' + str(kernel_size) + ', ' + str(kernel_size) + ', ' + str(kernel_size) + '], dtype=torch.float32)\n' 13 | 'conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')\n', 14 | stmt='ans = conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')', 15 | number=number) 16 | return run_time / number * 1e3 17 | 18 | 19 | def pytorch_cuda(N, D, H, W, C, kernel_size, K, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 20 | A = torch.rand([N, C, D, H, W], dtype=torch.float32).cuda("cuda:" + str(dev)) 21 | W = torch.rand([K, C//groups, kernel_size, kernel_size, kernel_size], dtype=torch.float32).cuda("cuda:" + str(dev)) 22 | 23 | # warm-up 24 | torch.nn.functional.conv3d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 25 | torch.cuda.synchronize() 26 | sum_time = 0.0 27 | for i in range(number): 28 | start = torch.cuda.Event(enable_timing=True) 29 | end = torch.cuda.Event(enable_timing=True) 30 | start.record() 31 | ans = torch.nn.functional.conv3d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 32 | end.record() 33 | 34 | # Waits for everything to finish running 35 | torch.cuda.synchronize() 36 | sum_time += start.elapsed_time(end) 37 | return sum_time / number 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 43 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 44 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 45 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 46 | parser.add_argument("--device", help="target device number", type=int, default=0) 47 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 48 | 49 | args = parser.parse_args() 50 | shapes = conv3d_shapes 51 | if args.to < 0: 52 | end = len(shapes) 53 | else: 54 | end = args.to 55 | shapes = shapes[args.from_:end] 56 | if args.type == "pytorch": 57 | if args.target == "cuda": 58 | baseline = pytorch_cuda 59 | elif args.target == "llvm": 60 | baseline = pytorch_cpu 61 | else: 62 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 63 | else: 64 | raise RuntimeError("Only implement pytorch baseline now, no '%s' baseline"%args.type) 65 | 66 | print("%s baselines convolution 3d for target %s (%d):" % (args.type, args.target, args.device)) 67 | for i, shape in enumerate(shapes): 68 | count = i + args.from_ 69 | 
print("layer", count) 70 | batch, in_channel, D, H, W, out_channel, _, k, _, stride, padding, _, _ = shape 71 | cost = baseline(batch, D, H, W, in_channel, k, out_channel, stride=stride, padding=padding, number=args.number, dev=args.device) 72 | print("Use %f(ms)" % cost) 73 | print("Done!") 74 | -------------------------------------------------------------------------------- /flextensor/baselines/dilation_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | import torch 4 | from flextensor.configs.dilation_config import dilation_shapes 5 | torch.backends.cudnn.enabled = False 6 | 7 | 8 | def pytorch_cpu(batch_size, height, width, channel, kernel_size, output_channel, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 9 | run_time = timeit.timeit(setup= 'import torch\n' 10 | 'conv = torch.nn.functional.conv2d\n' 11 | 'A = torch.rand([' + str(batch_size) + ', ' + str(channel) + ', ' + str(height) + ', ' + str(width) + '], dtype=torch.float32)\n' 12 | 'W = torch.rand([' + str(output_channel) + ', ' + str(channel//groups) + ', ' + str(kernel_size) + ', ' + str(kernel_size) + '], dtype=torch.float32)\n' 13 | 'conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')\n', 14 | stmt='ans = conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')', 15 | number=number) 16 | return run_time / number * 1e3 17 | 18 | 19 | def pytorch_cuda(N, H, W, C, kernel_size, K, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 20 | A = torch.rand([N, C, H, W], dtype=torch.float32).cuda("cuda:" + str(dev)) 21 | W = torch.rand([K, C//groups, kernel_size, kernel_size], dtype=torch.float32).cuda("cuda:" + str(dev)) 22 | 23 | # warm-up 24 | torch.nn.functional.conv2d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 25 | torch.cuda.synchronize() 26 | sum_time = 0.0 27 | for i in range(number): 28 | start = torch.cuda.Event(enable_timing=True) 29 | end = torch.cuda.Event(enable_timing=True) 30 | start.record() 31 | ans = torch.nn.functional.conv2d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 32 | end.record() 33 | 34 | # Waits for everything to finish running 35 | torch.cuda.synchronize() 36 | 37 | sum_time += start.elapsed_time(end) 38 | return sum_time / number 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 44 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 45 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 46 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 47 | parser.add_argument("--device", help="target device number", type=int, default=0) 48 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 49 | 50 | args = parser.parse_args() 51 | shapes = dilation_shapes 52 | if args.to < 0: 53 | end = len(shapes) 54 | else: 55 | end = args.to 56 | shapes = shapes[args.from_:end] 57 | if args.type == "pytorch": 58 | if args.target == "cuda": 59 | baseline = pytorch_cuda 60 | elif args.target == "llvm": 61 | baseline = pytorch_cpu 62 | else: 63 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 64 | else: 65 | raise RuntimeError("Only 
implement pytorch baseline now, no '%s' baseline"%args.type) 66 | 67 | print("%s baselines dilation convolution for target %s (%d):" % (args.type, args.target, args.device)) 68 | for i, shape in enumerate(shapes): 69 | count = i + args.from_ 70 | print("layer", count) 71 | batch, in_channel, H, W, out_channel, k, _, stride, padding, dilation, groups = shape 72 | cost = baseline(batch, H, W, in_channel, k, out_channel, stride=stride, padding=padding, dilation=dilation, groups=groups, number=args.number, dev=args.device) 73 | print("Use %f(ms)" % cost) 74 | print("Done!") 75 | -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/conv1d-p100.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/baselines/flextensor/conv1d-p100.txt -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/conv1d-titanxp.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/baselines/flextensor/conv1d-titanxp.txt -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/conv1d-v100.txt: -------------------------------------------------------------------------------- 1 | conv1d_conv1d_batch1_(1, 192, 3136, 128, 1, 1, 0, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [8, 2, 3, 4], [49, 2, 32, 1]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[512, 0]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [8, 4, 4, 1], [49, 2, 32, 1]], "reduce": [[4, 3, 16], [1, 1, 1]], "reorder": [[1]], "inline": [], "unroll": [[512, 1]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [[1, 0]]}] 2 | conv1d_conv1d_batch1_(1, 128, 3136, 256, 9, 1, 1, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [16, 1, 1, 8], [6, 1, 523, 1]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1, 1]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [16, 4, 2, 2], [5, 1, 313, 2]], "reduce": [[32, 1, 4], [1, 1, 9]], "reorder": [[2]], "inline": [], "unroll": [[1500, 0]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [[0, 1]]}] 3 | conv1d_conv1d_batch1_(1, 512, 784, 256, 1, 1, 0, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [16, 1, 8, 4], [14, 1, 56, 1]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[512, 0]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [8, 4, 8, 1], [28, 1, 14, 2]], "reduce": [[8, 4, 16], [1, 1, 1]], "reorder": [[1]], "inline": [], "unroll": [[1500, 1]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [[0, 1]]}] 4 | conv1d_conv1d_batch1_(1, 256, 784, 512, 9, 1, 1, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [256, 1, 1, 1], [1, 3, 262, 1]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[512, 1]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [128, 2, 1, 2], [1, 1, 389, 2]], "reduce": [[32, 2, 4], [1, 1, 9]], "reorder": [[2]], "inline": [], "unroll": [[1500, 1]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], 
"reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [[1, 1]]}] 5 | conv1d_conv1d_batch1_(1, 1024, 196, 512, 1, 1, 0, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [64, 2, 2, 4], [1, 2, 98, 1]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[0, 0]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [32, 2, 8, 1], [7, 1, 14, 2]], "reduce": [[2, 32, 16], [1, 1, 1]], "reorder": [[1]], "inline": [], "unroll": [[1500, 1]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [[0, 0]]}] 6 | conv1d_conv1d_batch1_(1, 512, 196, 1024, 9, 1, 1, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [128, 1, 4, 1], [2, 1, 33, 3]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [256, 4, 1, 1], [1, 1, 95, 2]], "reduce": [[1, 128, 4], [1, 1, 9]], "reorder": [[1]], "inline": [], "unroll": [[512, 0]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [[0, 1]]}] 7 | conv1d_conv1d_batch1_(1, 1024, 49, 1024, 9, 1, 1, 1, 1)_cuda(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [256, 2, 1, 2], [1, 1, 51, 1]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[512, 0]], "merge": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [128, 4, 2, 1], [1, 1, 43, 1]], "reduce": [[32, 2, 16], [1, 1, 9]], "reorder": [[1]], "inline": [], "unroll": [[1500, 1]], "merge": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [[1, 1]]}] -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/conv3d-p100.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/baselines/flextensor/conv3d-p100.txt -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/conv3d-titanxp.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/baselines/flextensor/conv3d-titanxp.txt -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/yolo-conv2d-p100.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/baselines/flextensor/yolo-conv2d-p100.txt -------------------------------------------------------------------------------- /flextensor/baselines/flextensor/yolo-conv2d-titanxp.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/baselines/flextensor/yolo-conv2d-titanxp.txt -------------------------------------------------------------------------------- /flextensor/baselines/gemm-cublas/Makefile: -------------------------------------------------------------------------------- 1 | CXX := nvcc 2 | TARGET := cublas_gemm 3 | CUBLAS_PATH := /usr/local/cuda-10.1 4 | HEADERS := -I $(CUBLAS_PATH)/include 5 | LIBS := -L $(CUBLAS_PATH)/lib64 -L/usr/local/lib 6 | LIBDEVICE := --dont-use-profile -ldir 
$(CUBLAS_PATH)/nvvm/libdevice 7 | CXXFLAGS := -arch=sm_70 -std=c++11 -O2 8 | 9 | all: conv 10 | 11 | conv: $(TARGET).cu 12 | $(CXX) $(CXXFLAGS) $(LIBDEVICE) $(HEADERS) $(LIBS) $(TARGET).cu -o $(TARGET) \ 13 | -lcublas 14 | 15 | .PHONY: clean 16 | 17 | clean: 18 | rm $(TARGET) || echo -n "" -------------------------------------------------------------------------------- /flextensor/baselines/gemm_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | import torch 4 | import numpy as np 5 | from flextensor.configs.gemm_config import gemm_shapes 6 | torch.backends.cudnn.enabled = False 7 | 8 | 9 | def numpy_cpu(N, K, M, number=100, dev=0): 10 | run_time = timeit.timeit(setup='import numpy\n' 11 | 'N = ' + str(N) + '\n' 12 | 'K = ' + str(K) + '\n' 13 | 'M = ' + str(M) + '\n' 14 | 'dtype = "float32"\n' 15 | 'a = numpy.random.rand(N, K).astype(dtype)\n' 16 | 'b = numpy.random.rand(K, M).astype(dtype)\n', 17 | stmt='answer = numpy.dot(a, b)', 18 | number=number) 19 | return run_time / number * 1e3 20 | 21 | 22 | def pytorch_cpu(N, K, M, number=100, dev=0): 23 | run_time = timeit.timeit(setup= 'import torch\n' 24 | 'A = torch.rand([' + str(N) + ', ' + str(K) + '], dtype=torch.float32)\n' 25 | 'B = torch.rand([' + str(K) + ', ' + str(M) + '], dtype=torch.float32)\n' 26 | 'torch.mm(A, B)\n', 27 | stmt='ans = torch.mm(A, B)', 28 | number=number) 29 | return run_time / number * 1e3 30 | 31 | 32 | def pytorch_cuda(N, K, M, number=100, dev=0): 33 | A = torch.rand([N, K], dtype=torch.float32).cuda("cuda:" + str(dev)) 34 | B = torch.rand([K, M], dtype=torch.float32).cuda("cuda:" + str(dev)) 35 | 36 | # warm-up 37 | torch.mm(A, B) 38 | torch.cuda.synchronize() 39 | sum_time = 0.0 40 | for i in range(number): 41 | start = torch.cuda.Event(enable_timing=True) 42 | end = torch.cuda.Event(enable_timing=True) 43 | start.record() 44 | ans = torch.mm(A, B) 45 | end.record() 46 | 47 | # Waits for everything to finish running 48 | torch.cuda.synchronize() 49 | sum_time += start.elapsed_time(end) 50 | 51 | return sum_time / number 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 57 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 58 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 59 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 60 | parser.add_argument("--device", help="target device number", type=int, default=0) 61 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 62 | 63 | args = parser.parse_args() 64 | shapes = gemm_shapes 65 | if args.to < 0: 66 | end = len(shapes) 67 | else: 68 | end = args.to 69 | shapes = shapes[args.from_:end] 70 | if args.type == "pytorch": 71 | if args.target == "cuda": 72 | baseline = pytorch_cuda 73 | elif args.target == "llvm": 74 | baseline = pytorch_cpu 75 | else: 76 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 77 | elif args.type == "numpy": 78 | if args.target == "llvm": 79 | baseline = numpy_cpu 80 | else: 81 | raise RuntimeError("Only support target 'llvm', but got %s"%args.target) 82 | else: 83 | raise RuntimeError("Only implement pytorch and numpy baselines now, no '%s' baseline"%args.type) 84 | 85 | print("%s baselines gemm for target %s (%d):" % (args.type, args.target, args.device)) 86 | for i, shape in
enumerate(shapes): 87 | count = i + args.from_ 88 | print("layer", count) 89 | N, K, M = shape 90 | cost = baseline(N, K, M, args.number, args.device) 91 | print("Use %f(ms)" % cost) 92 | print("Done!") 93 | -------------------------------------------------------------------------------- /flextensor/baselines/gemv_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | import torch 4 | from flextensor.configs.gemv_config import gemv_shapes 5 | torch.backends.cudnn.enabled = False 6 | 7 | 8 | def pytorch_cpu(N, K, number=100, dev=0): 9 | run_time = timeit.timeit(setup= 'import torch\n' 10 | 'A = torch.rand([' + str(N) + ', ' + str(K) + '], dtype=torch.float32)\n' 11 | 'B = torch.rand([' + str(K) + ', 1], dtype=torch.float32)\n' 12 | 'torch.mm(A, B)\n', 13 | stmt='ans = torch.mm(A, B)', 14 | number=number) 15 | return run_time / number * 1e3 16 | 17 | 18 | def pytorch_cuda(N, K, number=100, dev=0): 19 | A = torch.rand([N, K], dtype=torch.float32).cuda("cuda:" + str(dev)) 20 | B = torch.rand([K, 1], dtype=torch.float32).cuda("cuda:" + str(dev)) 21 | 22 | # warm-up 23 | torch.mm(A, B) 24 | torch.cuda.synchronize() 25 | sum_time = 0.0 26 | for i in range(number): 27 | start = torch.cuda.Event(enable_timing=True) 28 | end = torch.cuda.Event(enable_timing=True) 29 | start.record() 30 | ans = torch.mm(A, B) 31 | end.record() 32 | 33 | # Waits for everything to finish running 34 | torch.cuda.synchronize() 35 | sum_time += start.elapsed_time(end) 36 | 37 | return sum_time / number 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 43 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 44 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 45 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 46 | parser.add_argument("--device", help="target device number", type=int, default=0) 47 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 48 | 49 | args = parser.parse_args() 50 | shapes = gemv_shapes 51 | if args.to < 0: 52 | end = len(shapes) 53 | else: 54 | end = args.to 55 | shapes = shapes[args.from_:end] 56 | if args.type == "pytorch": 57 | if args.target == "cuda": 58 | baseline = pytorch_cuda 59 | elif args.target == "llvm": 60 | baseline = pytorch_cpu 61 | else: 62 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 63 | else: 64 | raise RuntimeError("Only implement pytorch baseline now, no '%s' baseline"%args.type) 65 | 66 | print("%s baselines gemv for target %s (%d):" % (args.type, args.target, args.device)) 67 | for i, shape in enumerate(shapes): 68 | count = i + args.from_ 69 | print("layer", count) 70 | N, K, _ = shape 71 | cost = baseline(N, K, args.number, args.device) 72 | print("Use %f(ms)" % cost) 73 | print("Done!") 74 | -------------------------------------------------------------------------------- /flextensor/baselines/grouped_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | import torch 4 | from flextensor.configs.grouped_config import grouped_shapes 5 | torch.backends.cudnn.enabled = False 6 | 7 | 8 | def pytorch_cpu(batch_size, height, width, channel, kernel_size, output_channel, stride=1, padding=0, dilation=1, groups=1, number=100, 
dev=0): 9 | run_time = timeit.timeit(setup= 'import torch\n' 10 | 'conv = torch.nn.functional.conv2d\n' 11 | 'A = torch.rand([' + str(batch_size) + ', ' + str(channel) + ', ' + str(height) + ', ' + str(width) + '], dtype=torch.float32)\n' 12 | 'W = torch.rand([' + str(output_channel) + ', ' + str(channel//groups) + ', ' + str(kernel_size) + ', ' + str(kernel_size) + '], dtype=torch.float32)\n' 13 | 'conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')\n', 14 | stmt='ans = conv(A, W, stride=' + str(stride) + ', padding=' + str(padding) + ', dilation=' + str(dilation) + ', groups=' + str(groups) + ')', 15 | number=number) 16 | return run_time / number * 1e3 17 | 18 | 19 | def pytorch_cuda(N, H, W, C, kernel_size, K, stride=1, padding=0, dilation=1, groups=1, number=100, dev=0): 20 | A = torch.rand([N, C, H, W], dtype=torch.float32).cuda("cuda:" + str(dev)) 21 | W = torch.rand([K, C//groups, kernel_size, kernel_size], dtype=torch.float32).cuda("cuda:" + str(dev)) 22 | 23 | # warm-up 24 | torch.nn.functional.conv2d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 25 | torch.cuda.synchronize() 26 | sum_time = 0.0 27 | for i in range(number): 28 | start = torch.cuda.Event(enable_timing=True) 29 | end = torch.cuda.Event(enable_timing=True) 30 | start.record() 31 | ans = torch.nn.functional.conv2d(A, W, stride=stride, padding=padding, dilation=dilation, groups=groups) 32 | end.record() 33 | 34 | # Waits for everything to finish running 35 | torch.cuda.synchronize() 36 | 37 | sum_time += start.elapsed_time(end) 38 | return sum_time / number 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("-f", "--from_", help="From which shape", type=int, default=0) 44 | parser.add_argument("-t", "--to", help="To which shape", type=int, default=-1) 45 | parser.add_argument("-n", "--number", help="number test run", type=int, default=10) 46 | parser.add_argument("--target", help="target device type", type=str, default="llvm") 47 | parser.add_argument("--device", help="target device number", type=int, default=0) 48 | parser.add_argument("--type", help="type of baseline", type=str, default="pytorch") 49 | 50 | args = parser.parse_args() 51 | shapes = grouped_shapes 52 | if args.to < 0: 53 | end = len(shapes) 54 | else: 55 | end = args.to 56 | shapes = shapes[args.from_:end] 57 | if args.type == "pytorch": 58 | if args.target == "cuda": 59 | baseline = pytorch_cuda 60 | elif args.target == "llvm": 61 | baseline = pytorch_cpu 62 | else: 63 | raise RuntimeError("Only support target 'llvm' and 'cuda', but got %s"%args.target) 64 | else: 65 | raise RuntimeError("Only implement pytorch baseline now, no '%s' baseline"%args.type) 66 | 67 | print("%s baselines grouped convolution for target %s (%d):" % (args.type, args.target, args.device)) 68 | for i, shape in enumerate(shapes): 69 | count = i + args.from_ 70 | print("layer", count) 71 | batch, in_channel, H, W, out_channel, k, _, stride, padding, dilation, groups = shape 72 | cost = baseline(batch, H, W, in_channel, k, out_channel, stride=stride, padding=padding, dilation=dilation, groups=groups, number=args.number, dev=args.device) 73 | print("Use %f(ms)" % cost) 74 | print("Done!") 75 | -------------------------------------------------------------------------------- /flextensor/baselines/shift_conv2d_baseline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 
import timeit 3 | import torch 4 | import time 5 | import tvm 6 | import topi 7 | import numpy as np 8 | import random 9 | import math 10 | from flextensor.configs.shift_conv2d_config import shift_conv2d_shape 11 | from flextensor.task import shiftconv2d 12 | 13 | torch.backends.cudnn.enabled = False 14 | 15 | def tvm_shift_conv2d_cpu(B, H, W, C, kernel_size, dilation, stride=1, number=100, dev=0): 16 | Input = torch.rand([B, H, W, C], dtype=torch.float32) 17 | Kernel = torch.rand([C, kernel_size, kernel_size], dtype=torch.float32) 18 | KernelIndex = torch.argmax(Kernel.reshape([C, -1]), dim=1) 19 | indexH = random.randint(0 ,kernel_size - 1) 20 | indexW = random.randint(0 ,kernel_size - 1) 21 | Kernel[:, indexH, indexW] = 0 22 | 23 | output, bufs = shiftconv2d(B, H, W, C, kernel_size, dilation, stride) 24 | s = tvm.te.create_schedule(output) 25 | ctx = tvm.cpu(dev) 26 | # print(tvm.lower(s, bufs, simple_mode=True)) 27 | f = tvm.build(s, bufs, 'llvm') 28 | 29 | im = tvm.nd.array(Input.numpy().astype(np.float32), ctx) 30 | fi = tvm.nd.array(KernelIndex.numpy().astype(np.int32), ctx) 31 | 32 | paddings = [math.ceil(((stride - 1) * H - stride + dilation * (kernel_size - 1)) / 2), 33 | math.ceil(((stride - 1) * W - stride + dilation * (kernel_size - 1)) / 2)] 34 | 35 | image_height = H 36 | image_width = W 37 | out_height = math.floor((image_height + 2 * paddings[0]- dilation * (kernel_size - 1) - 1) / stride + 1) 38 | out_width = math.floor((image_width + 2 * paddings[1] - dilation * (kernel_size - 1) - 1) / stride + 1) 39 | output_shape = (B, out_height, out_width, C) 40 | un = tvm.nd.array(np.zeros(output_shape).astype(np.float32), ctx) 41 | 42 | start_time = time.time() 43 | for i in range(number): 44 | f(im, fi, un) 45 | end_time = time.time() 46 | return (end_time - start_time) * 1e3 / number 47 | 48 | def tvm_shift_conv2d_cuda(B, H, W, C, kernel_size, stride, padding, number=100, dev=0): 49 | pass 50 | 51 | if __name__ == "__main__": 52 | shapes = shift_conv2d_shape 53 | 54 | """warm up""" 55 | """cost = pytorch_cpu(*shapes[0]) 56 | cost = pytorch_cuda(*shapes[0]) 57 | cost = tvm_shift_conv2d_cpu(*shapes[0])""" 58 | # cost = tvm_shift_conv2d_cuda(*shapes[0]) 59 | 60 | for shape in shapes: 61 | print("Shape", shape) 62 | cost = tvm_shift_conv2d_cpu(*shape) 63 | print("Tvm cost on cpu: {}ms".format(cost)) 64 | 65 | print("Done!") 66 | -------------------------------------------------------------------------------- /flextensor/baselines/sparse/sparse-gemm.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import torch 4 | 5 | 6 | def torch_spmm(M, N, K, dtype="float32", n_trial=1): 7 | spmm = torch.sparse.mm 8 | # a_np = np.random.uniform(-0.91, 0.9, [M, K]).astype(dtype) 9 | # b_np = np.random.uniform(-0.91, 0.9, [K, N]).astype(dtype) 10 | # a_torch = torch.relu(torch.tensor(a_np)).to_sparse() 11 | # b_torch = torch.tensor(b_np) 12 | m = torch.distributions.bernoulli.Bernoulli(torch.tensor(0.9)) 13 | a_torch = m.sample([M, K]).to_sparse() 14 | b_torch = m.sample([K, N]) 15 | 16 | # warm-up 17 | res = spmm(a_torch, b_torch) 18 | beg = time.time() 19 | for i in range(n_trial): 20 | spmm(a_torch, b_torch) 21 | end = time.time() 22 | return (end - beg) * 1e3 / n_trial 23 | 24 | 25 | def torch_spmv(M, K, dtype="float32", n_trial=1): 26 | spmm = torch.sparse.mm 27 | # a_np = np.random.uniform(-0.91, 0.9, [M, K]).astype(dtype) 28 | # b_np = np.random.uniform(-0.91, 0.9, [K, 1]).astype(dtype) 29 | # a_torch = 
torch.relu(torch.tensor(a_np)).to_sparse() 30 | # b_torch = torch.tensor(b_np) 31 | m = torch.distributions.bernoulli.Bernoulli(torch.tensor(0.9)) 32 | a_torch = m.sample([M, K]).to_sparse() 33 | b_torch = m.sample([K, 1]) 34 | 35 | # warm-up 36 | res = spmm(a_torch, b_torch) 37 | beg = time.time() 38 | for i in range(n_trial): 39 | spmm(a_torch, b_torch) 40 | end = time.time() 41 | return (end - beg) * 1e3 / n_trial 42 | 43 | 44 | if __name__ == "__main__": 45 | # for i in [1, 5, 10, 15, 20, 50]: 46 | # size = i * 2**10 47 | # try: 48 | # res = torch_spmm(size, size, size, n_trial=10) 49 | # except Exception as e: 50 | # print(str(e)) 51 | # res = float("inf") 52 | # print("Spmm pytorch: [scale: %d]: %f ms" % (size, res)) 53 | for i in range(1, 10 + 1): 54 | size = i * 10 * 2**10 55 | try: 56 | res = torch_spmv(size, size, n_trial=10) 57 | except Exception as e: 58 | print(str(e)) 59 | res = float("inf") 60 | print("Spmv pytorch: [scale: %d]: %f ms" % (size, res)) 61 | -------------------------------------------------------------------------------- /flextensor/configs/PixelCNN_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | (batch, H, W, in_C, out_C, KH, KW, mask_type, bias, dilation, stride, padding) 3 | """ 4 | PixelCNN_shape = [ 5 | (1, 256, 256, 3, 32, 7, 7, 'A', None, 1, 1, 0), 6 | (1, 128, 128, 256, 256, 3, 3, 'B', None, 1, 1, 0) 7 | ] -------------------------------------------------------------------------------- /flextensor/configs/bilinear_config.py: -------------------------------------------------------------------------------- 1 | bilinear_shapes = [ 2 | (32, 1024, 1024, 32), 3 | (64, 512, 1024, 32), 4 | (128, 256, 512, 64), 5 | (256, 128, 256, 128), 6 | (512, 64, 128, 256), 7 | ] -------------------------------------------------------------------------------- /flextensor/configs/block_circulant_matrix_config.py: -------------------------------------------------------------------------------- 1 | block_circulant_matrix_shapes = [] 2 | 3 | for shape in [(1024, 256), (1024, 512), (1024, 40)]: 4 | for factor in [8, 16]: 5 | block_circulant_matrix_shapes.append((*shape, factor)) 6 | -------------------------------------------------------------------------------- /flextensor/configs/conv1d_config.py: -------------------------------------------------------------------------------- 1 | conv1d_shapes = [ 2 | (1, 192, 56 * 56, 128, 192, 1 * 1, 1, 1, 0, 1, 1), 3 | (1, 128, 56 * 56, 256, 128, 3 * 3, 1, 1, 1, 1, 1), 4 | (1, 512, 28 * 28, 256, 512, 1 * 1, 1, 1, 0, 1, 1), 5 | (1, 256, 28 * 28, 512, 256, 3 * 3, 1, 1, 1, 1, 1), 6 | (1, 1024, 196, 512, 1024, 1 * 1, 1, 1, 0, 1, 1), 7 | (1, 512, 196, 1024, 512, 3 * 3, 1, 1, 1, 1, 1), 8 | (1, 1024, 49, 1024, 1024, 3 * 3, 1, 1, 1, 1, 1), 9 | ] -------------------------------------------------------------------------------- /flextensor/configs/conv3d_config.py: -------------------------------------------------------------------------------- 1 | conv3d_shapes = [ 2 | # yolo 3 | # (1, 3, 16, 112, 112, 64, 3, 7, 1, 2, 3, 1, 1), 4 | # (1, 64, 16, 28, 28, 192, 64, 3, 1, 1, 1, 1, 1), 5 | # (1, 192, 4, 28, 28, 128, 192, 1, 1, 1, 0, 1, 1), 6 | # (1, 128, 4, 28, 28, 256, 128, 3, 1, 1, 1, 1, 1), 7 | # (1, 512, 4, 14, 14, 256, 512, 1, 1, 1, 0, 1, 1), 8 | # (1, 256, 4, 14, 14, 512, 256, 3, 1, 1, 1, 1, 1), 9 | # (1, 1024, 4, 7, 7, 512, 1024, 1, 1, 1, 0, 1, 1), 10 | # (1, 512, 4, 7, 7, 1024, 512, 3, 1, 1, 1, 1, 1), 11 | 12 | # resnet-18 13 | # N, C, D, H, W, _, K, _, kernel_size, stride=1, padding=0,
dilation=1, groups=1 14 | # skip the first layer 15 | # ( 1, 3, 16, 112, 112, 64, 7, (1, 2, 2), 3, 1, 1), 16 | # conv2_x 17 | ( 1, 64, 8, 28, 28, 64, 64, 3, 1, 1, 1, 1, 1), 18 | # ( 1, 64, 8, 28, 28, 64, 3, 1, 1, 1, 1), 19 | # conv3_x 20 | ( 1, 64, 8, 28, 28, 128, 64, 3, 1, 2, 1, 1, 1), 21 | ( 1, 128, 4, 14, 14, 128, 128, 3, 1, 1, 1, 1, 1), 22 | ( 1, 64, 8, 28, 28, 128, 64, 1, 1, 2, 1, 1, 1), 23 | 24 | # ( 1, 128, 4, 14, 14, 128, 3, 1, 1, 1, 1), 25 | # ( 1, 128, 4, 14, 14, 128, 3, 1, 1, 1, 1), 26 | # conv4_x 27 | ( 1, 128, 4, 14, 14, 256, 128, 3, 1, 2, 1, 1, 1), 28 | ( 1, 256, 2, 7, 7, 256, 256, 3, 1, 1, 1, 1, 1), 29 | ( 1, 128, 4, 14, 14, 256, 128, 1, 1, 2, 1, 1, 1), 30 | 31 | # ( 1, 256, 2, 7, 7, 256, 3, 1, 1, 1, 1), 32 | # ( 1, 256, 2, 7, 7, 256, 3, 1, 1, 1, 1), 33 | # conv5_x 34 | ( 1, 256, 2, 7, 7, 512, 256, 3, 1, 2, 1, 1, 1), 35 | ( 1, 512, 1, 3, 3, 512, 512, 3, 1, 1, 1, 1, 1), 36 | ( 1, 256, 2, 7, 7, 512, 256, 1, 1, 2, 1, 1, 1), 37 | 38 | # ( 1, 512, 1, 3, 3, 512, 3, 1, 1, 1, 1), 39 | # ( 1, 512, 1, 3, 3, 512, 3, 1, 1, 1, 1), 40 | ] -------------------------------------------------------------------------------- /flextensor/configs/conv_transpose2d_config.py: -------------------------------------------------------------------------------- 1 | # N, C, H, W, K, kernel_size, stride=1, padding=0, output_padding=0, dilation=1, groups=1 2 | conv_transpose2d_shapes = [ 3 | # U-Net(https://arxiv.org/abs/1505.04597) with input size: 572 * 572 4 | (1, 1024, 28, 28, 512, 2, 2, 0, 0, 1, 1), # 1*1024*28*28 -> 1*512*56*56 5 | (1, 512, 52, 52, 256, 2, 2, 0, 0, 1, 1), # 1*512*52*52 -> 1*256*104*104 6 | (1, 256, 100, 100, 128, 2, 2, 0, 0, 1, 1), # 1*256*100*100 -> 1*128*200*200 7 | (1, 128, 196, 196, 64, 2, 2, 0, 0, 1, 1), # 1*128*196*196 -> 1*64*392*392 8 | 9 | # ShiftNet(https://arxiv.org/abs/1801.09392) with input size: 256*256 10 | (1, 512, 1, 1, 512, 4, 2, 1, 0, 1, 1), # 1*512*1*1 -> 1*512*2*2 11 | (1, 1024, 2, 2, 512, 4, 2, 1, 0, 1, 1), # 1*1024*2*2 -> 1*512*4*4 12 | (1, 1024, 4, 4, 512, 4, 2, 1, 0, 1, 1), # 1*1024*4*4 -> 1*512*8*8 13 | (1, 1024, 8, 8, 512, 4, 2, 1, 0, 1, 1), # 1*1024*8*8 -> 1*512*16*16 14 | (1, 1024, 16, 16, 256, 4, 2, 1, 0, 1, 1), # 1*1024*16*16 -> 1*256*32*32 15 | (1, 768, 32, 32, 128, 4, 2, 1, 0, 1, 1), # 1*768*32*32 -> 1*128*64*64 16 | (1, 256, 64, 64, 64, 4, 2, 1, 0, 1, 1), # 1*256*64*64 -> 1*64*128*128 17 | (1, 128, 128, 128, 3, 4, 2, 1, 0, 1, 1), # 1*128*128*128 -> 1*3*256*256 18 | ] -------------------------------------------------------------------------------- /flextensor/configs/depthwise_config.py: -------------------------------------------------------------------------------- 1 | depthwise_shapes = [ 2 | (1, 32, 112, 112, 1, 3, 3, 1, 1, 1), 3 | (1, 16, 112, 112, 6, 3, 3, 2, 1, 1), 4 | (1, 24, 56, 56, 6, 3, 3, 2, 1, 1), 5 | (1, 32, 28, 28, 6, 3, 3, 2, 1, 1), 6 | (1, 64, 14, 14, 6, 3, 3, 1, 1, 1), 7 | (1, 96, 14, 14, 6, 3, 3, 2, 1, 1), 8 | (1, 160, 7, 7, 6, 3, 3, 1, 1, 1), 9 | ] 10 | 11 | 12 | depthwise_shapes_b8 = [ 13 | (8, 32, 112, 112, 1, 3, 3, 1, 1, 1), 14 | (8, 16, 112, 112, 6, 3, 3, 2, 1, 1), 15 | (8, 24, 56, 56, 6, 3, 3, 2, 1, 1), 16 | (8, 32, 28, 28, 6, 3, 3, 2, 1, 1), 17 | (8, 64, 14, 14, 6, 3, 3, 1, 1, 1), 18 | (8, 96, 14, 14, 6, 3, 3, 2, 1, 1), 19 | (8, 160, 7, 7, 6, 3, 3, 1, 1, 1), 20 | ] 21 | 22 | 23 | depthwise_shapes_b16 = [ 24 | (16, 32, 112, 112, 1, 3, 3, 1, 1, 1), 25 | (16, 16, 112, 112, 6, 3, 3, 2, 1, 1), 26 | (16, 24, 56, 56, 6, 3, 3, 2, 1, 1), 27 | (16, 32, 28, 28, 6, 3, 3, 2, 1, 1), 28 | (16, 64, 14, 14, 6, 3, 3, 1, 1, 1), 29 | (16, 96, 14, 
14, 6, 3, 3, 2, 1, 1), 30 | (16, 160, 7, 7, 6, 3, 3, 1, 1, 1), 31 | ] -------------------------------------------------------------------------------- /flextensor/configs/dilation_config.py: -------------------------------------------------------------------------------- 1 | dilation_shapes_test = [ 2 | (1, 3, 448, 448, 64, 7, 7, 2, 3, 2, 1), 3 | (1, 64, 112, 112, 192, 3, 3, 1, 1, 2, 1), 4 | (1, 192, 56, 56, 128, 1, 1, 1, 0, 2, 1), 5 | (1, 128, 56, 56, 256, 3, 3, 1, 1, 2, 1), 6 | ] 7 | 8 | dilation_shapes = [ 9 | # yolo 10 | (1, 256, 56, 56, 256, 1, 1, 1, 0, 2, 1), # conv5 4 11 | (1, 256, 56, 56, 512, 3, 3, 1, 1, 2, 1), # conv6 5 12 | (1, 512, 28, 28, 256, 1, 1, 1, 0, 2, 1), # conv7 6 13 | (1, 256, 28, 28, 512, 3, 3, 1, 1, 2, 1), # conv8 7 14 | 15 | (1, 512, 28, 28, 512, 1, 1, 1, 0, 2, 1), # conv15 8 16 | (1, 512, 28, 28, 1024, 3, 3, 1, 1, 2, 1), # conv16 9 17 | (1, 1024, 14, 14, 512, 1, 1, 1, 0, 2, 1), # conv17 10 18 | (1, 512, 14, 14, 1024, 3, 3, 1, 1, 2, 1), # conv18 11 19 | 20 | (1, 1024, 14, 14, 1024, 3, 3, 1, 1, 2, 1), # conv21 12 21 | (1, 1024, 14, 14, 1024, 3, 3, 2, 1, 2, 1), # conv22 13 22 | (1, 1024, 7, 7, 1024, 3, 3, 1, 1, 2, 1), # conv23 14 23 | ] -------------------------------------------------------------------------------- /flextensor/configs/gated_pixelcnn_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | batch_size, input_height, input_width, in_channels, out_channels, kernel_height, kernel_width, ClassVector=None, bias=None, dilation=1, stride=1, padding=0 3 | """ 4 | gated_pixelcnn_shape = [ 5 | (1, 256, 256, 3, 256, 3, None, None, 1, 1, 0) 6 | ] -------------------------------------------------------------------------------- /flextensor/configs/gemm_config.py: -------------------------------------------------------------------------------- 1 | old_gemm_shapes = [ 2 | (32, 32, 32), 3 | (64, 64, 64), 4 | (128, 128, 128), 5 | (256, 256, 256), 6 | (512, 512, 512), 7 | (1024, 1024, 1024), 8 | (2048, 2048, 2048) 9 | ] 10 | 11 | gemm_shapes = [] 12 | for i in range(5, 13): 13 | for j in range(5, 13): 14 | for k in range(5, 13): 15 | gemm_shapes.append([2**i, 2**k, 2**j]) 16 | 17 | 18 | test_gemm_shapes = [ 19 | # batch 20 | # height 21 | # width 22 | # length 23 | 24 | # open 25 | (1, 1024, 1024, 1024), 26 | (2, 512, 512, 512), 27 | (3, 1024, 32, 1024), 28 | # confidential 29 | (16, 4096, 128, 1024), 30 | (32, 28, 1024, 28), 31 | ] -------------------------------------------------------------------------------- /flextensor/configs/gemv_config.py: -------------------------------------------------------------------------------- 1 | gemv_shapes = [ 2 | (128, 128, 128), 3 | (256, 256, 256), 4 | (512, 512, 512), 5 | (1024, 1024, 1024), 6 | (2048, 2048, 2048), 7 | (4096, 4096, 4096), 8 | ] -------------------------------------------------------------------------------- /flextensor/configs/grouped_config.py: -------------------------------------------------------------------------------- 1 | grouped_shapes = [ 2 | (1, 1024, 14, 14, 512, 1, 1, 1, 0, 1, 4), 3 | (1, 1024, 14, 14, 512, 1, 1, 1, 0, 1, 8), 4 | (1, 1024, 14, 14, 512, 1, 1, 1, 0, 1, 16), 5 | (1, 1024, 14, 14, 512, 1, 1, 1, 0, 1, 32), 6 | ] 7 | 8 | grouped_shapes_ = [ 9 | (1, 64, 112, 112, 192, 3, 3, 1, 1, 1, 16), # conv2 1 10 | (1, 192, 56, 56, 128, 1, 1, 1, 0, 1, 16), # conv3 2 11 | (1, 128, 56, 56, 256, 3, 3, 1, 1, 1, 16), # conv4 3 12 | (1, 256, 56, 56, 256, 1, 1, 1, 0, 1, 16), # conv5 4 13 | (1, 256, 56, 56, 512, 3, 3, 1, 1, 1, 16), # conv6 5 14 | (1, 512, 28, 28, 
256, 1, 1, 1, 0, 1, 16), # conv7 6 15 | (1, 256, 28, 28, 512, 3, 3, 1, 1, 1, 16), # conv8 7 16 | 17 | (1, 512, 28, 28, 512, 1, 1, 1, 0, 1, 16), # conv15 8 18 | (1, 512, 28, 28, 1024, 3, 3, 1, 1, 1, 16), # conv16 9 19 | (1, 1024, 14, 14, 512, 1, 1, 1, 0, 1, 16), # conv17 10 20 | (1, 512, 14, 14, 1024, 3, 3, 1, 1, 1, 16), # conv18 11 21 | 22 | (1, 1024, 14, 14, 1024, 3, 3, 1, 1, 1, 16), # conv21 12 23 | (1, 1024, 14, 14, 1024, 3, 3, 2, 1, 1, 16), # conv22 13 24 | (1, 1024, 7, 7, 1024, 3, 3, 1, 1, 1, 16), # conv23 14 25 | ] -------------------------------------------------------------------------------- /flextensor/configs/maxunpooling1d_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | batch, channels, in_lengths, kernel_size, stride, padding 3 | """ 4 | maxunpooling1d_shape = [ 5 | (1, 128, 114, 2, 2, 0), 6 | (1, 256, 56, 2, 2, 0), 7 | (1, 512, 28, 2, 2, 0), 8 | (1, 512, 14, 2, 2, 0) 9 | ] -------------------------------------------------------------------------------- /flextensor/configs/maxunpooling2d_config.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | batch, channels, height, width, kernel_size, stride, padding 4 | """ 5 | maxunpooling2d_shape = [ 6 | # DeconvNet(https://arxiv.org/abs/1505.04366) --- based on VGG-16 7 | (1, 64, 112, 112, 2, 2, 0), # 1*64*112*112 -> 1*64*224*224 8 | (1, 128, 56, 56, 2, 2, 0), # 1*128*56*56 -> 1*128*112*112 9 | (1, 256, 28, 28, 2, 2, 0), # 1*256*28*28 -> 1*256*56*56 10 | (1, 512, 14, 14, 2, 2, 0), # 1*512*14*14 -> 1*512*28*28 11 | (1, 512, 7, 7, 2, 2, 0), # 1*512*7*7 -> 1*512*14*14 12 | ] -------------------------------------------------------------------------------- /flextensor/configs/mttkrp_config.py: -------------------------------------------------------------------------------- 1 | mttkrp_shapes = [ 2 | (128, 256, 512, 64), 3 | (128, 256, 256, 128), 4 | (256, 128, 256, 128), 5 | (256, 128, 128, 256), 6 | (512, 64, 128, 256), 7 | (512, 64, 64, 512), 8 | (1024, 32, 32, 1024) 9 | ] -------------------------------------------------------------------------------- /flextensor/configs/shift_conv2d_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | (N, H, W, C, kernel_size, dilation) 3 | """ 4 | shift_conv2d_shape = [ 5 | # ShiftNet(https://arxiv.org/abs/1801.09392) with input size: 256*256 6 | # (1, 128, 128, 64, 3, 1), 7 | (1, 128, 128, 64, 3, 1), 8 | (1, 64, 64, 128, 5, 1), 9 | (1, 32, 32, 256, 3, 1), 10 | (1, 16, 16, 512, 3, 1) 11 | ] -------------------------------------------------------------------------------- /flextensor/examples/__init__.py: -------------------------------------------------------------------------------- 1 | from .single_operation import FUNC_TABLE 2 | -------------------------------------------------------------------------------- /flextensor/examples/autotvm_opt_conv1_cpu.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import logging 3 | import sys 4 | import time 5 | from tvm import autotvm 6 | from flextensor.scheduler import parallel_evaluate 7 | 8 | 9 | @autotvm.template 10 | def conv2d_channel_batch(B, N, M, C, K, L, O, stride=1, padding=0, dtype="float32"): 11 | A = tvm.te.placeholder((B, N, M, C), dtype=dtype, name="A") 12 | W = tvm.te.placeholder((K, L, C, O), dtype=dtype, name="W") 13 | N_out = max(0, (N + padding * 2 - K) // stride) + 1 14 | M_out = max(0, (M + padding * 2 - L) // stride) + 1 15 | Apad = 
tvm.te.compute((B, N + 2 * padding, M + 2 * padding, C), 16 | lambda b, i, j, k: tvm.te.if_then_else( 17 | tvm.te.all(i >= padding, j >= padding, i < N + padding, j < M + padding), 18 | A[b, i - padding, j - padding, k], 0.0), name="Apad") 19 | rx, ry = tvm.te.reduce_axis((0, K), name="rx"), tvm.te.reduce_axis((0, L), name="ry") 20 | rc = tvm.te.reduce_axis((0, C), name="rc") 21 | Output = tvm.te.compute((B, N_out, M_out, O), 22 | lambda b, i, j, k: tvm.te.sum(Apad[b, i * stride + rx, j * stride + ry, rc] * W[rx, ry, rc, k], 23 | axis=[rx, ry, rc]), 24 | name="Output") 25 | 26 | s = tvm.te.create_schedule(Output.op) 27 | s[Apad].compute_inline() 28 | CL = s.cache_write(Output, "local") 29 | 30 | n, h, w, c = s[Output].op.axis 31 | out = s[Output].fuse(h, w) 32 | cfg = autotvm.get_config() 33 | cfg.define_split("split_n", n, num_outputs=2) 34 | cfg.define_split("split_c", c, num_outputs=2) 35 | no, ni = cfg["split_n"].apply(s, Output, n) 36 | co, ci = cfg["split_c"].apply(s, Output, c) 37 | s[Output].reorder(no, out, co, ni, ci) 38 | s[Output].parallel(out) 39 | 40 | # schedule CL 41 | s[CL].compute_at(s[Output], co) 42 | ni, hi, wi, ci = s[CL].op.axis 43 | xi, yi, ki = s[CL].op.reduce_axis 44 | cfg.define_split("split_k", ki, num_outputs=2) 45 | ko, ki = cfg["split_k"].apply(s, CL, ki) 46 | s[CL].reorder(ko, xi, yi, ni, ki, ci) 47 | s[CL].unroll(ki) 48 | s[CL].vectorize(ci) 49 | 50 | return s, [A, W, Output] 51 | 52 | 53 | args = (1, 14, 14, 256, 3, 3, 512, 1, 1) 54 | task = autotvm.task.create(conv2d_channel_batch, args=args, target="llvm") 55 | 56 | logging.getLogger("autotvm").setLevel(logging.DEBUG) 57 | logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) 58 | 59 | measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=10)) 60 | 61 | # begin tuning 62 | tuner = autotvm.tuner.XGBTuner(task) 63 | # tuner = autotvm.tuner.RandomTuner(task) 64 | # tuner = autotvm.tuner.GATuner(task) 65 | # tuner = autotvm.tuner.GridSearchTuner(task) 66 | n_trial = len(task.config_space) 67 | print("trials=", n_trial) 68 | beg = time.time() 69 | tuner.tune(n_trial=n_trial, measure_option=measure_option, callbacks=[autotvm.callback.log_to_file("conv2d.log")]) 70 | end = time.time() 71 | # history best 72 | with autotvm.apply_history_best("conv2d.log"): 73 | with tvm.target.create("llvm"): 74 | s, bufs = conv2d_channel_batch(*args) 75 | func = tvm.build(s, bufs) 76 | 77 | # time evaluate 78 | time_cost = parallel_evaluate(s, bufs, "llvm", 2) 79 | print("time cost is: ", time_cost, "ms, use ",(end - beg), "s") 80 | -------------------------------------------------------------------------------- /flextensor/examples/opt_blur2d_cpu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import numpy as np 4 | import argparse 5 | from flextensor.examples import FUNC_TABLE 6 | from flextensor.test import test_graph_schedule_cpu_general_dx 7 | from flextensor.train import Entity, train_op_schedule_cpu_general_dx 8 | 9 | 10 | def run(M, N, k, model_path, epoch=5, sample_size=16, number=100, test=False): 11 | entities = [] 12 | func = FUNC_TABLE["gaussian_blur2d"].func 13 | args = (M, N, k) 14 | entities.append(Entity("gaussian_blur2d", args)) 15 | model_path = os.path.abspath(model_path) 16 | if not test: 17 | beg = time.time() 18 | train_op_schedule_cpu_general_dx(entities, epoch, sample_size, model_path) 19 | end = time.time() 20 | print("{}({}):".format("gaussian_blur2d", args)) 21 | 
print("train done! use {}ms".format((end - beg) * 1e3)) 22 | test_graph_schedule_cpu_general_dx(func, args, model_path, number=number) 23 | 24 | 25 | def numpy_baseline(M, N, k, number=10): 26 | A = np.random.random([M, N]) 27 | B = np.zeros([M, N]) 28 | 29 | def blur(A, k, B): 30 | Apad = np.vstack([np.zeros([k//2, A.shape[1] + k // 2]), np.hstack([np.zeros([A.shape[0], k // 2]), A])]) 31 | for i in range(k): 32 | for j in range(k): 33 | np.add(B, np.vstack([np.hstack([Apad[:A.shape[0], j:], np.zeros([A.shape[0], j])])[i:, :A.shape[1]], np.zeros([i, A.shape[1]])]), B) 34 | np.divide(B, k * k, B) 35 | 36 | beg = time.time() 37 | for i in range(number): 38 | blur(A, k, B) 39 | end = time.time() 40 | cost = (end - beg) * 1e3 41 | return cost 42 | 43 | 44 | if __name__ == "__main__": 45 | # parser = argparse.ArgumentParser() 46 | # parser.add_argument("-t", "--train", help="train the model", action="store_true") 47 | # parser.add_argument("-p", "--pytorch", help="run pytorch baseline", action="store_true") 48 | # parser.add_argument("-a", "--auto_schedule", help="run auto-scheduler", action="store_true") 49 | # parser.add_argument("-n", "--number", help="number of tests", type=int, default=100) 50 | # parser.add_argument("-f", "--model_file_path", type=str, default="../logs/test_model.pkl") 51 | # parser.add_argument("--params", help="N,H,W,L,C,k,K,stride,padding", type=str, default="1,14,14,14,512,3,512,1,1") 52 | # parser.add_argument("--epoch", type=int, default=5) 53 | # parser.add_argument("--sample", type=int, default=16) 54 | # args = parser.parse_args() 55 | # test = not args.train 56 | # use_torch = args.pytorch 57 | # use_auto = args.auto_schedule 58 | # try: 59 | # params = [int(x) for x in args.params.split(",")] 60 | # batch_size, height, width, depth, channel, kernel_size, output_channel, stride, padding = params 61 | # if use_torch: 62 | # pytorch_baseliine(batch_size, height, width, depth, channel, kernel_size, output_channel, stride, padding, args.number) 63 | # if use_auto: 64 | # run(batch_size, height, width, depth, channel, kernel_size, output_channel, stride, padding, 65 | # args.model_file_path, args.epoch, args.sample, args.number, test) 66 | # except Exception as e: 67 | # raise ValueError("Bad parameters, please refer to usage") 68 | arg_lst = [ 69 | (1024, 1024, 7), 70 | (1024, 1024, 3), 71 | (2048, 2048, 3), 72 | ] 73 | 74 | names = [ 75 | "1024_7", 76 | "1024_3", 77 | "2048_3", 78 | ] 79 | 80 | for i in range(len(arg_lst)): 81 | # model_path = "../models/opt_blur2d_" + names[i] + "_cpu_process.pkl" 82 | # entities = [] 83 | # func = FUNC_TABLE["gaussian_blur2d"].func 84 | # args = arg_lst[i] 85 | # entities.append(Entity("gaussian_blur2d", args)) 86 | # model_path = os.path.abspath(model_path) 87 | # train_op_schedule_cpu_general_dx(entities, 20, 25, model_path, logfile="process_blur_" + names[i] + "_cpu.txt", device="cuda:1") 88 | M, N, k = arg_lst[i] 89 | print(arg_lst[i], numpy_baseline(M, N, k)) 90 | -------------------------------------------------------------------------------- /flextensor/examples/opt_conv_gpu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import argparse 5 | 6 | 7 | def run(N, H, W, C, kernel_size, K, stride, padding, model_path, epoch=5, sample=16, number=100, test=False): 8 | return -1 9 | 10 | 11 | def pytorch_baseliine(N, H, W, C, kernel_size, K, stride, padding, number=100, dev=0): 12 | A = torch.rand([N, C, H, W]).cuda("cuda:" + str(dev)) 13 | W 
= torch.rand([K, C, kernel_size, kernel_size]).cuda("cuda:" + str(dev)) 14 | 15 | start = torch.cuda.Event(enable_timing=True) 16 | end = torch.cuda.Event(enable_timing=True) 17 | 18 | start.record() 19 | for i in range(number): 20 | torch.nn.functional.conv2d(A, W, stride=stride, padding=padding) 21 | end.record() 22 | 23 | # Waits for everything to finish running 24 | torch.cuda.synchronize() 25 | 26 | return start.elapsed_time(end) / number 27 | 28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("-t", "--train", help="train the model", action="store_true") 32 | parser.add_argument("-p", "--pytorch", help="run pytorch baseline", action="store_true") 33 | parser.add_argument("-a", "--flextensor", help="run auto-scheduler", action="store_true") 34 | parser.add_argument("-n", "--number", help="number of tests", type=int, default=100) 35 | parser.add_argument("-f", "--model_file_path", type=str, default="../logs/test_model.pkl") 36 | parser.add_argument("--params", help="N,H,W,C,k,K,stride,padding", type=str, default="1,14,14,512,3,512,1,1") 37 | parser.add_argument("--epoch", type=int, default=5) 38 | parser.add_argument("--sample", type=int, default=16) 39 | parser.add_argument("--device", type=int, default=0) 40 | args = parser.parse_args() 41 | test = not args.train 42 | use_torch = args.pytorch 43 | use_auto = args.flextensor 44 | try: 45 | params = [int(x) for x in args.params.split(",")] 46 | batch_size, height, width, channel, kernel_size, output_channel, stride, padding = params 47 | if use_torch: 48 | cost = pytorch_baseliine(batch_size, height, width, channel, kernel_size, output_channel, stride, padding, args.number, args.device) 49 | print("PyTorch baseline: {}ms".format(cost)) 50 | if use_auto: 51 | run(batch_size, height, width, channel, kernel_size, output_channel, stride, padding, 52 | args.model_file_path, args.epoch, args.sample, args.number, test) 53 | except Exception as e: 54 | raise ValueError("Bad parameters, please refer to usage") 55 | 56 | # arg_lst = [ 57 | # (1, 7, 7, 1024, 3, 3, 1024, 1, 1), 58 | # # (8, 7, 7, 1024, 3, 3, 1024, 1, 1), 59 | # # (64, 7, 7, 1024, 3, 3, 1024, 1, 1), 60 | # # (256, 7, 7, 1024, 3, 3, 1024, 1, 1), 61 | # (1, 14, 14, 1024, 1, 1, 512, 1, 0), 62 | # (1, 28, 28, 256, 3, 3, 512, 1, 1), 63 | # (1, 28, 28, 512, 1, 1, 256, 1, 0), 64 | # (1, 56, 56, 128, 3, 3, 256, 1, 1), 65 | # (1, 56, 56, 192, 1, 1, 128, 1, 0), 66 | # (1, 112, 112, 64, 3, 3, 192, 1, 1), 67 | # (1, 448, 448, 3, 7, 7, 64, 2, 3) 68 | # ] 69 | # names = [ 70 | # "yolo24_b1", 71 | # # "yolo24_b8", 72 | # # "yolo24_b64", 73 | # # "yolo24_b256", 74 | # "yolo19_b1", 75 | # "yolo10_b1", 76 | # "yolo7_b1", 77 | # "yolo4_b1", 78 | # "yolo3_b1", 79 | # "yolo2_b1", 80 | # "yolo1_b1" 81 | # ] 82 | # for i in range(len(arg_lst)): 83 | # model_path = "opt_conv2d_nchw_" + names[i] + "_gpu.pkl" 84 | # entities = [] 85 | # args = arg_lst[i] 86 | # entities.append(Entity("conv2d_nchw", args)) 87 | # model_path = os.path.abspath(model_path) 88 | # train_op_schedule_gpu_general_dx(entities, 40, 25, model_path, logfile="process_conv2d_nchw_" + names[i] + "_gpu.txt") 89 | 90 | -------------------------------------------------------------------------------- /flextensor/examples/opt_gemm_cpu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import argparse 5 | from flextensor.examples import FUNC_TABLE 6 | from flextensor.test import test_graph_schedule_cpu_general_dx 7 | from flextensor.train 
import Entity, train_op_schedule_cpu_general_dx 8 | 9 | 10 | def run(batch_size, M, N, L, model_path, epoch=5, sample=16, number=100, test=False): 11 | entities = [] 12 | func = FUNC_TABLE["matmul_batch"].func 13 | args = (batch_size, M, N, L) 14 | entities.append(Entity(func, args)) 15 | model_path = os.path.abspath(model_path) 16 | if not test: 17 | beg = time.time() 18 | train_op_schedule_cpu_general_dx(entities, epoch, sample, model_path) 19 | end = time.time() 20 | print("{}({}):".format("matmul_batch", args)) 21 | print("train done! use {}ms".format((end - beg) * 1e3)) 22 | test_graph_schedule_cpu_general_dx(func, args, model_path, number=number) 23 | 24 | 25 | def pytorch_baseliine(batch_size, M, N, L, number=100): 26 | A = torch.rand((batch_size, M, N)) 27 | B = torch.rand((N, L)) 28 | beg = time.time() 29 | for i in range(number): 30 | C = A.matmul(B) 31 | end = time.time() 32 | cost = (end - beg) / number * 1e3 33 | print("pytorch gemm use {}ms".format(cost)) 34 | return cost 35 | 36 | 37 | if __name__ == "__main__": 38 | # parser = argparse.ArgumentParser() 39 | # parser.add_argument("-t", "--train", help="train the model", action="store_true") 40 | # parser.add_argument("-p", "--pytorch", help="run pytorch baseline", action="store_true") 41 | # parser.add_argument("-a", "--auto_schedule", help="run auto-scheduler", action="store_true") 42 | # parser.add_argument("-n", "--number", help="number of tests", type=int, default=100) 43 | # parser.add_argument("-f", "--model_file_path", type=str, default="../logs/test_model.pkl") 44 | # parser.add_argument("--params", help="B,M,N,L", type=str, default="1,1024,1024,1024") 45 | # parser.add_argument("--epoch", type=int, default=5) 46 | # parser.add_argument("--sample", type=int, default=16) 47 | # args = parser.parse_args() 48 | # test = not args.train 49 | # use_torch = args.pytorch 50 | # use_auto = args.auto_schedule 51 | # try: 52 | # params = [int(x) for x in args.params.split(",")] 53 | # batch_size, M, N, L = params 54 | # if use_torch: 55 | # pytorch_baseliine(batch_size, M, N, L, args.number) 56 | # if use_auto: 57 | # run(batch_size, M, N, L, args.model_file_path, args.epoch, args.sample, args.number, test) 58 | # except Exception as e: 59 | # raise ValueError("Bad parameters, please refer to usage") 60 | 61 | # gemm 62 | arg_lst = [ 63 | (1, 128, 128, 128), 64 | (1, 256, 256, 256), 65 | (1, 512, 512, 512), 66 | (1, 1024, 1024, 1024), 67 | (1, 2048, 2048, 2048), 68 | (1, 4096, 4096, 4096), 69 | (1, 1024, 32, 1024), 70 | (1, 32, 1024, 32), 71 | ] 72 | 73 | names = [ 74 | "128_128_128_b1", 75 | "256_256_256_b1", 76 | "512_512_512_b1", 77 | "1024_1024_1024_b1", 78 | "2048_2048_2048_b1", 79 | "4096_4096_4096_b1", 80 | "1024_32_1024_b1", 81 | "32_1024_32_b1", 82 | ] 83 | for i in range(len(arg_lst)): 84 | model_path = "opt_gemm_" + names[i] + "_cpu.pkl" 85 | entities = [] 86 | args = arg_lst[i] 87 | entities.append(Entity("matmul_batch", args)) 88 | model_path = os.path.abspath(model_path) 89 | train_op_schedule_cpu_general_dx(entities, 20, 50, model_path, logfile="process_gemm_" + names[i] + "_cpu.txt", device="cuda:0") 90 | 91 | -------------------------------------------------------------------------------- /flextensor/examples/opt_gemm_gpu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import torch 4 | import argparse 5 | from flextensor.test import test_graph_schedule_gpu_general_dx 6 | from flextensor.train import Entity, train_op_schedule_gpu_general_dx 7 
| 8 | 9 | def run(batch_size, M, N, L, model_path, epoch=5, sample=16, number=100, test=False): 10 | entities = [] 11 | args = (batch_size, M, N, L) 12 | entities.append(Entity("matmul_batch", args)) 13 | model_path = os.path.abspath(model_path) 14 | if not test: 15 | beg = time.time() 16 | train_op_schedule_gpu_general_dx(entities, epoch, sample, model_path) 17 | end = time.time() 18 | print("{}({}):".format("matmul_batch", args)) 19 | print("train done! use {}ms".format((end - beg) * 1e3)) 20 | test_graph_schedule_gpu_general_dx(entities, model_path, sampling=True, number=number) 21 | 22 | 23 | def pytorch_baseliine(batch_size, M, N, L, number=100): 24 | A = torch.rand((batch_size, M, N)).cuda() 25 | B = torch.rand((N, L)).cuda() 26 | beg = time.time() 27 | for i in range(number): 28 | C = A.matmul(B) 29 | torch.cuda.synchronize(); end = time.time() 30 | cost = (end - beg) / number * 1e3 31 | return cost 32 | 33 | 34 | if __name__ == "__main__": 35 | # parser = argparse.ArgumentParser() 36 | # parser.add_argument("-t", "--train", help="train the model", action="store_true") 37 | # parser.add_argument("-p", "--pytorch", help="run pytorch baseline", action="store_true") 38 | # parser.add_argument("-a", "--auto_schedule", help="run auto-scheduler", action="store_true") 39 | # parser.add_argument("-n", "--number", help="number of tests", type=int, default=100) 40 | # parser.add_argument("-f", "--model_file_path", type=str, default="../logs/test_model.pkl") 41 | # parser.add_argument("--params", help="B,M,N,L", type=str, default="1,1024,1024,1024") 42 | # parser.add_argument("--epoch", type=int, default=5) 43 | # parser.add_argument("--sample", type=int, default=16) 44 | # args = parser.parse_args() 45 | # test = not args.train 46 | # use_torch = args.pytorch 47 | # use_auto = args.auto_schedule 48 | # try: 49 | # params = [int(x) for x in args.params.split(",")] 50 | # batch_size, M, N, L = params 51 | # if use_torch: 52 | # cost = pytorch_baseliine(batch_size, M, N, L, args.number) 53 | # print("pytorch gemm use {}ms".format(cost)) 54 | # if use_auto: 55 | # run(batch_size, M, N, L, args.model_file_path, args.epoch, args.sample, args.number, test) 56 | # except Exception as e: 57 | # raise ValueError("Bad parameters, please refer to usage") 58 | 59 | # gemm 60 | arg_lst = [ 61 | (1, 32, 32, 32), 62 | (1, 64, 64, 64), 63 | (1, 128, 128, 128), 64 | (1, 256, 256, 256), 65 | (1, 512, 512, 512), 66 | (1, 1024, 1024, 1024), 67 | (1, 2048, 2048, 2048), 68 | (1, 4096, 4096, 4096), 69 | (1, 1024, 32, 1024), 70 | (1, 32, 1024, 32), 71 | ] 72 | 73 | names = [ 74 | "32_32_32_b1", 75 | "64_64_64_b1", 76 | "128_128_128_b1", 77 | "256_256_256_b1", 78 | "512_512_512_b1", 79 | "1024_1024_1024_b1", 80 | "2048_2048_2048_b1", 81 | "4096_4096_4096_b1", 82 | "1024_32_1024_b1", 83 | "32_1024_32_b1", 84 | ] 85 | for i in range(len(arg_lst)): 86 | model_path = "opt_gemm_" + names[i] + "_gpu.pkl" 87 | entities = [] 88 | args = arg_lst[i] 89 | entities.append(Entity("matmul_batch", args)) 90 | model_path = os.path.abspath(model_path) 91 | train_op_schedule_gpu_general_dx(entities, 20, 50, model_path, logfile="process_gemm_" + names[i] + "_gpu.txt", device="cuda:1") 92 | 93 | 94 | -------------------------------------------------------------------------------- /flextensor/examples/opt_outer_cpu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import numpy as np 4 | import argparse 5 | from flextensor.examples import FUNC_TABLE 6 | from flextensor.test import 
test_graph_schedule_cpu_general_dx 7 | from flextensor.train import Entity, train_op_schedule_cpu_general_dx 8 | 9 | 10 | def run(M, N, model_path, epoch=5, sample_size=16, number=100, test=False): 11 | entities = [] 12 | func = FUNC_TABLE["outer_product"].func 13 | args = (M, N) 14 | entities.append(Entity("outer_product", args)) 15 | model_path = os.path.abspath(model_path) 16 | if not test: 17 | beg = time.time() 18 | train_op_schedule_cpu_general_dx(entities, epoch, sample_size, model_path) 19 | end = time.time() 20 | print("{}({}):".format("outer_product", args)) 21 | print("train done! use {}ms".format((end - beg) * 1e3)) 22 | test_graph_schedule_cpu_general_dx(func, args, model_path, number=number) 23 | 24 | 25 | def numpy_baseline(M, N, number=100): 26 | A = np.random.random(M) 27 | B = np.random.random(N) 28 | C = np.zeros([M, N]) 29 | beg = time.time() 30 | for i in range(number): 31 | np.outer(A, B, C) 32 | end = time.time() 33 | cost = (end - beg) * 1e3 / number 34 | return cost 35 | 36 | 37 | if __name__ == "__main__": 38 | # parser = argparse.ArgumentParser() 39 | # parser.add_argument("-t", "--train", help="train the model", action="store_true") 40 | # parser.add_argument("-p", "--pytorch", help="run pytorch baseline", action="store_true") 41 | # parser.add_argument("-a", "--auto_schedule", help="run auto-scheduler", action="store_true") 42 | # parser.add_argument("-n", "--number", help="number of tests", type=int, default=100) 43 | # parser.add_argument("-f", "--model_file_path", type=str, default="../logs/test_model.pkl") 44 | # parser.add_argument("--params", help="N,H,W,L,C,k,K,stride,padding", type=str, default="1,14,14,14,512,3,512,1,1") 45 | # parser.add_argument("--epoch", type=int, default=5) 46 | # parser.add_argument("--sample", type=int, default=16) 47 | # args = parser.parse_args() 48 | # test = not args.train 49 | # use_torch = args.pytorch 50 | # use_auto = args.auto_schedule 51 | # try: 52 | # params = [int(x) for x in args.params.split(",")] 53 | # batch_size, height, width, depth, channel, kernel_size, output_channel, stride, padding = params 54 | # if use_torch: 55 | # pytorch_baseliine(batch_size, height, width, depth, channel, kernel_size, output_channel, stride, padding, args.number) 56 | # if use_auto: 57 | # run(batch_size, height, width, depth, channel, kernel_size, output_channel, stride, padding, 58 | # args.model_file_path, args.epoch, args.sample, args.number, test) 59 | # except Exception as e: 60 | # raise ValueError("Bad parameters, please refer to usage") 61 | arg_lst = [ 62 | (512, 512), 63 | (1024, 1024), 64 | (2048, 2048), 65 | ] 66 | 67 | names = [ 68 | "512", 69 | "1024", 70 | "2048" 71 | ] 72 | 73 | for i in range(len(arg_lst)): 74 | model_path = "opt_outer_" + names[i] + "_cpu.pkl" 75 | entities = [] 76 | func = FUNC_TABLE["outer_product"].func 77 | args = arg_lst[i] 78 | entities.append(Entity("outer_product", args)) 79 | model_path = os.path.abspath(model_path) 80 | train_op_schedule_cpu_general_dx(entities, 10, 20, model_path, logfile="process_outer_" + names[i] + "_cpu.txt", device="cuda:1") 81 | -------------------------------------------------------------------------------- /flextensor/examples/transfer_cpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | from flextensor.train import Entity, train_op_schedule_cpu_general_dx 3 | 4 | 5 | arg_lst = [ 6 | (1, 7, 7, 1024, 3, 3, 1024, 1, 1), 7 | # (8, 7, 7, 1024, 3, 3, 1024, 1, 1), 8 | # (64, 7, 7, 1024, 3, 3, 1024, 1, 1), 9 | # 
(256, 7, 7, 1024, 3, 3, 1024, 1, 1), 10 | (1, 14, 14, 1024, 1, 1, 512, 1, 0), 11 | (1, 28, 28, 256, 3, 3, 512, 1, 1), 12 | (1, 28, 28, 512, 1, 1, 256, 1, 0), 13 | (1, 56, 56, 128, 3, 3, 256, 1, 1), 14 | (1, 56, 56, 192, 1, 1, 128, 1, 0), 15 | (1, 112, 112, 64, 3, 3, 192, 1, 1), 16 | (1, 448, 448, 3, 7, 7, 64, 2, 3), 17 | (1, 1024, 1024, 1024), 18 | ] 19 | 20 | names = [ 21 | "yolo24_b1", 22 | # "yolo24_b8", 23 | # "yolo24_b64", 24 | # "yolo24_b256", 25 | "yolo19_b1", 26 | "yolo10_b1", 27 | "yolo7_b1", 28 | "yolo4_b1", 29 | "yolo3_b1", 30 | "yolo2_b1", 31 | "yolo1_b1", 32 | "gemm_1024" 33 | ] 34 | 35 | func_names = ["conv2d_nchw"] * 8 + ["matmul_batch"] 36 | 37 | 38 | def transfer(pre_train, post_train): 39 | entities = [] 40 | for i in pre_train: 41 | args = arg_lst[i] 42 | entities.append(Entity(func_names[i], args)) 43 | model_path = "cpu_transfer_pre{}_post{}.pkl".format(pre_train, post_train) 44 | model_path = os.path.abspath(model_path) 45 | train_op_schedule_cpu_general_dx(entities, 20, 50, model_path, 46 | logfile="process_transfer_pre{}_(post{})_cpu.txt".format(pre_train, post_train), 47 | device="cuda:3") 48 | entities = [] 49 | for i in post_train: 50 | args = arg_lst[i] 51 | entities.append(Entity(func_names[i], args)) 52 | model_path = "cpu_transfer_pre{}_post{}.pkl".format(pre_train, post_train) 53 | model_path = os.path.abspath(model_path) 54 | train_op_schedule_cpu_general_dx(entities, 20, 50, model_path, 55 | logfile="process_transfer_(pre_{})_post{}_cpu.txt".format(pre_train, post_train), 56 | device="cuda:3") 57 | 58 | 59 | if __name__ == "__main__": 60 | transfer([1, 3], [5]) 61 | transfer([0, 2], [4]) 62 | transfer([0, 1, 2], [6]) 63 | transfer([1, 3, 5], [8]) 64 | -------------------------------------------------------------------------------- /flextensor/examples/transfer_gpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | from flextensor.train import Entity, train_op_schedule_gpu_general_dx 3 | 4 | 5 | arg_lst = [ 6 | (1, 7, 7, 1024, 3, 3, 1024, 1, 1), 7 | # (8, 7, 7, 1024, 3, 3, 1024, 1, 1), 8 | # (64, 7, 7, 1024, 3, 3, 1024, 1, 1), 9 | # (256, 7, 7, 1024, 3, 3, 1024, 1, 1), 10 | (1, 14, 14, 1024, 1, 1, 512, 1, 0), 11 | (1, 28, 28, 256, 3, 3, 512, 1, 1), 12 | (1, 28, 28, 512, 1, 1, 256, 1, 0), 13 | (1, 56, 56, 128, 3, 3, 256, 1, 1), 14 | (1, 56, 56, 192, 1, 1, 128, 1, 0), 15 | (1, 112, 112, 64, 3, 3, 192, 1, 1), 16 | (1, 448, 448, 3, 7, 7, 64, 2, 3), 17 | (1, 1024, 1024, 1024), 18 | ] 19 | 20 | names = [ 21 | "yolo24_b1", 22 | # "yolo24_b8", 23 | # "yolo24_b64", 24 | # "yolo24_b256", 25 | "yolo19_b1", 26 | "yolo10_b1", 27 | "yolo7_b1", 28 | "yolo4_b1", 29 | "yolo3_b1", 30 | "yolo2_b1", 31 | "yolo1_b1", 32 | "gemm_1024" 33 | ] 34 | 35 | func_names = ["conv2d_nchw"] * 8 + ["matmul_batch"] 36 | 37 | 38 | def transfer(pre_train, post_train): 39 | entities = [] 40 | for i in pre_train: 41 | args = arg_lst[i] 42 | entities.append(Entity(func_names[i], args)) 43 | model_path = "gpu_transfer_pre{}_post{}.pkl".format(pre_train, post_train) 44 | model_path = os.path.abspath(model_path) 45 | train_op_schedule_gpu_general_dx(entities, 20, 50, model_path, 46 | logfile="process_transfer_pre_{}_(post{})_gpu.txt".format(pre_train, post_train), 47 | device="cuda:0") 48 | entities = [] 49 | for i in post_train: 50 | args = arg_lst[i] 51 | entities.append(Entity(func_names[i], args)) 52 | model_path = "gpu_transfer0_pre{}_post{}.pkl".format(pre_train, post_train) 53 | model_path = os.path.abspath(model_path) 54 | 
train_op_schedule_gpu_general_dx(entities, 20, 50, model_path, 55 | logfile="process_transfer_(pre_{})_post{}_gpu.txt".format(pre_train, post_train), 56 | device="cuda:0") 57 | 58 | 59 | if __name__ == "__main__": 60 | transfer([1, 3], [5]) 61 | transfer([0, 2], [4]) 62 | transfer([0, 1, 2], [6]) 63 | transfer([1, 3, 5], [8]) 64 | -------------------------------------------------------------------------------- /flextensor/nn/README.md: -------------------------------------------------------------------------------- 1 | ## Steps for Adding an Operator 2 | 3 | 1. Add the implementation of the operator to `.ops.py`, e.g. 4 | ```python 5 | def gemm(A, B, transposeA=False, transposeB=False): 6 | """Matrix multiplies matrix 7 | 8 | Args: 9 | ----------------------------- 10 | A: tvm.te.tensor.Tensor 11 | shape [height, width] 12 | B: tvm.te.tensor.Tensor 13 | shape [width, length] 14 | transposeA: (optional:False) bool 15 | transposeB: (optional:False) bool 16 | ----------------------------- 17 | 18 | Returns: 19 | ----------------------------- 20 | tvm.te.tensor.Tensor 21 | shape [height, length] 22 | ----------------------------- 23 | """ 24 | if transposeA and transposeB: 25 | k = tvm.te.reduce_axis((0, B.shape[1])) 26 | assert_print(A.shape[0].value == B.shape[1].value) 27 | return tvm.te.compute((A.shape[1], B.shape[0]), lambda i, j: tvm.te.sum(A[k, i] * B[j, k], axis=k)) 28 | elif transposeA and not transposeB: 29 | k = tvm.te.reduce_axis((0, B.shape[0])) 30 | assert_print(A.shape[0].value == B.shape[0].value) 31 | return tvm.te.compute((A.shape[1], B.shape[1]), lambda i, j: tvm.te.sum(A[k, i] * B[k, j], axis=k)) 32 | elif not transposeA and transposeB: 33 | k = tvm.te.reduce_axis((0, B.shape[1])) 34 | assert_print(A.shape[1].value == B.shape[1].value) 35 | return tvm.te.compute((A.shape[0], B.shape[0]), lambda i, j: tvm.te.sum(A[i, k] * B[j, k], axis=k)) 36 | else: 37 | k = tvm.te.reduce_axis((0, B.shape[0])) 38 | assert_print(A.shape[1].value == B.shape[0].value) 39 | return tvm.te.compute((A.shape[0], B.shape[1]), lambda i, j: tvm.te.sum(A[i, k] * B[k, j], axis=k)) 40 | ``` 41 | 2. Add a configuration file under `../configs/`, e.g. `../configs/gemm_config.py` 42 | 43 | ```python 44 | gemm_shapes = [ 45 | (32, 32, 32), 46 | (64, 64, 64), 47 | (128, 128, 128), 48 | (256, 256, 256), 49 | (512, 512, 512), 50 | (1024, 1024, 1024), 51 | (2048, 2048, 2048) 52 | ] 53 | ``` 54 | 3. Register a task for the operator in `../task.py`, e.g. 55 | 56 | ```python 57 | from flextensor.nn.ops import gemm as op_gemm 58 | 59 | def gemm(N, K, M): 60 | A = tvm.te.placeholder((N, K)) 61 | B = tvm.te.placeholder((K, M)) 62 | Output = op_gemm(A, B) 63 | return [Output.op], [A, B, Output] 64 | 65 | from flextensor.configs.gemm_config import gemm_shapes 66 | 67 | for shape in gemm_shapes: 68 | N, K, M = shape 69 | for j in range(4): 70 | register_task(Task("gemm", "gemm", gemm, (N, K, M), "llvm", j)) 71 | register_task(Task("gemm", "gemm", gemm, (N, K, M), "cuda", j)) 72 | ``` 73 | 4. (Optional) Add a correctness test for the operator in `../test/test_ops.py` (a sketch is given below) 74 | 5. Add an optimization test for the operator under `../optimize/`
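A minimal sketch of such a correctness test for step 4 (an illustration, not the repository's actual test code; it assumes the default `float32` dtype, an LLVM-enabled TVM build, and the `gemm` operator from step 1):

```python
import numpy as np
import tvm
from flextensor.nn.ops import gemm


def test_gemm():
    N, K, M = 64, 64, 64
    A = tvm.te.placeholder((N, K), name="A")
    B = tvm.te.placeholder((K, M), name="B")
    C = gemm(A, B)
    # use the default (naive) schedule; correctness does not depend on scheduling
    s = tvm.te.create_schedule(C.op)
    func = tvm.build(s, [A, B, C], "llvm")
    ctx = tvm.cpu(0)
    a = np.random.uniform(size=(N, K)).astype("float32")
    b = np.random.uniform(size=(K, M)).astype("float32")
    c = tvm.nd.array(np.zeros((N, M), dtype="float32"), ctx)
    func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
    # numpy serves as the reference implementation
    np.testing.assert_allclose(c.asnumpy(), np.matmul(a, b), rtol=1e-4)
```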
-------------------------------------------------------------------------------- /flextensor/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .ops import (conv1d, conv_transpose1d, conv2d_nchw, conv_transpose2d_nchw, 2 | conv3d_ncdhw, conv_transpose3d_ncdhw, depthwise_conv2d_nchw, 3 | conv2d_nhwc, gemm_conv2d_nchw, gemv, gemm, batch_gemm, linear, 4 | bilinear, MTTKRP3d, pointwise_multiply, mean, variance, 5 | batch_normalization2d, block_circulant_matrix, MaxUnpooling1d, 6 | MaxUnpooling2d, ShiftConv2d_nhwc, PixelCNN, GatedPixelCNN, conv2d_nchwc, 7 | winograd_conv2d_nchw) 8 | from .layers import (YoloConvLayer1, YoloConvLayer2, YoloConvLayer3, YoloConvLayer4, 9 | YoloConvLayer5, YoloConvLayer6, YoloConvLayer7, YoloConvLayer8, 10 | YoloConvLayer9, YoloConvLayer10, YoloConvLayer11, YoloConvLayer12, 11 | YoloConvLayer13, YoloConvLayer14, YoloConvLayer15, 12 | YoloGemmConvLayer1, YoloGemmConvLayer17, YoloGemmConvLayer24, 13 | SqueezeNetFire8, SqueezeNetFire8Gemm) -------------------------------------------------------------------------------- /flextensor/optimize/README.md: -------------------------------------------------------------------------------- 1 | # Testing Operators for Mali GPU Optimized by FlexTensor 2 | 3 | [TOC] 4 | 5 | ## Preparation 6 | 7 | ### [Installation of TVM](https://tvm.apache.org/docs/install/) 8 | 9 | Note: you only need to turn on `USE_LLVM` in `config.cmake`. 10 | 11 | ### [Installation of TVM Java Frontend](https://github.com/apache/incubator-tvm/blob/main/jvm/README.md) 12 | 13 | ### [Installation & Setup of TVM-RPC App](https://github.com/apache/incubator-tvm/blob/main/apps/android_rpc/README.md) 14 | 15 | 16 | 17 | ## Test 18 | 19 | 1. Set up the environment: 20 | 21 | 1. Start the TVM RPC tracker and the TVM RPC app. 22 | 2. Export `/path/to/FlexTensor` to `PYTHONPATH`. 23 | 24 | 2. Open `FlexTensor/flextensor/optimize`. 25 | 26 | 3. Run the following commands to test the optimized operators (gemm, conv1d, and conv2d), assuming the tracker's ip:port is 0.0.0.0:9190: 27 | 28 | ```shell 29 | python3 optimize_gemm.py \ 30 | --target_host "llvm -mtriple=aarch64-linux-android" \ 31 | --host 0.0.0.0 --port 9190 \ 32 | --use_rpc tracker \ 33 | --fcompile ndk \ 34 | --device_key android \ 35 | --target opencl \ 36 | --test gemm-config.log 37 | 38 | python3 optimize_conv1d.py \ 39 | --target_host "llvm -mtriple=aarch64-linux-android" \ 40 | --host 0.0.0.0 --port 9190 \ 41 | --use_rpc tracker \ 42 | --fcompile ndk \ 43 | --device_key android \ 44 | --target opencl \ 45 | --test conv1d-config.log 46 | 47 | python3 optimize_conv2d.py \ 48 | --target_host "llvm -mtriple=aarch64-linux-android" \ 49 | --host 0.0.0.0 --port 9190 \ 50 | --use_rpc tracker \ 51 | --fcompile ndk \ 52 | --device_key android \ 53 | --target opencl \ 54 | --test conv2d-config.log 55 | ``` 56 | 57 | Then you will see the generated kernels and evaluation results.
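Each record in a `*-config.log` file is one line of the form `task_key:config`, where `config` is a JSON value holding the per-op schedule decisions (spatial/reduce splits, fuse, reorder, unroll, ...) plus a graph-level part (e.g. inline decisions). A quick way to inspect these records is sketched below (it assumes only the `key:[...]` line layout of these logs; FlexTensor's own loader may differ):

```python
import json

with open("gemm-config.log") as fin:
    for line in fin:
        # split at the first ":[" -- everything before it is the task key
        key, _, rest = line.strip().partition(":[")
        op_configs, graph_config = json.loads("[" + rest)
        print(key, "->", [sorted(c.keys()) for c in op_configs])
```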
58 | 59 | -------------------------------------------------------------------------------- /flextensor/optimize/conv1d-config.log: -------------------------------------------------------------------------------- 1 | conv1d_conv1d_(1, 192, 3136, 128, 1, 1, 0, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1, 24, 1, 8], [1, 784, 2, 2]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [8, 8, 2, 1], [112, 1, 28, 1]], "reduce": [[2, 2, 48], [1, 1, 1]], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 2 | conv1d_conv1d_(1, 128, 3136, 256, 9, 1, 1, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1, 2, 1, 64], [3, 1, 1, 1046]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1, 2, 1, 128], [1, 313, 5, 2]], "reduce": [[32, 4, 1], [1, 9, 1]], "reorder": [[0]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 3 | conv1d_conv1d_(1, 512, 784, 256, 1, 1, 0, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1, 64, 1, 8], [1, 16, 1, 49]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [16, 4, 1, 4], [56, 1, 14, 1]], "reduce": [[1, 256, 2], [1, 1, 1]], "reorder": [[0]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 4 | conv1d_conv1d_(1, 256, 784, 512, 9, 1, 1, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [128, 1, 2, 1], [3, 1, 131, 2]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1, 512, 1, 1], [2, 389, 1, 1]], "reduce": [[2, 32, 4], [3, 3, 1]], "reorder": [[2]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 5 | conv1d_conv1d_(1, 1024, 196, 512, 1, 1, 0, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1024, 1, 1, 1], [2, 1, 49, 2]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [32, 1, 8, 2], [1, 2, 14, 7]], "reduce": [[1, 32, 32], [1, 1, 1]], "reorder": [[1]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 6 | conv1d_conv1d_(1, 512, 196, 1024, 9, 1, 1, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [64, 1, 2, 4], [1, 1, 99, 2]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}, {"fuse": [[1, 2, 3]], "spatial": [[1, 1, 1, 1], [1, 4, 1, 256], [1, 2, 1, 95]], "reduce": [[64, 2, 4], [1, 3, 3]], "reorder": [[0]], "inline": [], "unroll": [[1, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], 
"merge": [], "special": []}] 7 | -------------------------------------------------------------------------------- /flextensor/optimize/conv2d-config.log: -------------------------------------------------------------------------------- 1 | conv2d_mobile_v21_(1, 16, 112, 112, 96, 3, 2, 1, 1, 16)_opencl(0):[[{"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [8, 2, 1, 1], [114, 1, 1, 1], [1, 1, 57, 2]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}, {"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [48, 1, 2, 1], [8, 1, 1, 7], [1, 1, 14, 4]], "reduce": [[1, 1, 1], [3, 1, 1], [3, 1, 1]], "reorder": [[0]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 2 | conv2d_mobile_v22_(1, 24, 56, 56, 144, 3, 2, 1, 1, 24)_opencl(0):[[{"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [4, 3, 2, 1], [29, 1, 1, 2], [1, 1, 58, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}, {"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [24, 2, 3, 1], [2, 2, 1, 7], [1, 1, 28, 1]], "reduce": [[1, 1, 1], [1, 3, 1], [1, 1, 3]], "reorder": [[1]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 3 | conv2d_mobile_v23_(1, 32, 28, 28, 192, 3, 2, 1, 1, 32)_opencl(0):[[{"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [32, 1, 1, 1], [5, 1, 2, 3], [1, 1, 30, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}, {"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [32, 2, 3, 1], [1, 1, 7, 2], [1, 2, 7, 1]], "reduce": [[1, 1, 1], [3, 1, 1], [1, 1, 3]], "reorder": [[2]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 4 | conv2d_mobile_v24_(1, 64, 14, 14, 384, 3, 1, 1, 1, 64)_opencl(0):[[{"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [8, 1, 4, 2], [4, 1, 4, 1], [1, 1, 8, 2]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}, {"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [48, 1, 2, 4], [2, 1, 7, 1], [1, 1, 7, 2]], "reduce": [[1, 1, 1], [1, 1, 3], [1, 3, 1]], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 5 | conv2d_mobile_v25_(1, 96, 14, 14, 576, 3, 2, 1, 1, 96)_opencl(0):[[{"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [8, 1, 12, 1], [2, 1, 8, 1], [1, 2, 2, 4]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}, {"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [48, 1, 12, 1], [1, 1, 1, 7], [1, 1, 7, 1]], "reduce": [[1, 1, 1], [1, 1, 3], [3, 1, 1]], "reorder": [[2]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 6 | conv2d_mobile_v26_(1, 160, 7, 7, 960, 3, 1, 1, 1, 160)_opencl(0):[[{"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [16, 5, 1, 2], [1, 1, 9, 1], [1, 1, 9, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}, {"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [160, 
1, 6, 1], [1, 7, 1, 1], [1, 1, 7, 1]], "reduce": [[1, 1, 1], [1, 3, 1], [1, 3, 1]], "reorder": [[2]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 7 | -------------------------------------------------------------------------------- /flextensor/optimize/depthwise_conv2d-config.log: -------------------------------------------------------------------------------- 1 | conv2d_depthwise_(1, 32, 112, 112, 1, 3, 1, 1, 1)_opencl(0):[[{"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [8, 4, 1, 1], [114, 1, 1, 1], [1, 2, 57, 1]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}, {"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [8, 2, 2, 1], [56, 1, 1, 2], [7, 1, 8, 2]], "reduce": [[1, 3, 1], [3, 1, 1]], "reorder": [[0]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 2 | conv2d_depthwise_(1, 16, 112, 112, 6, 3, 2, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [1, 2, 1, 8], [1, 19, 1, 6], [1, 1, 1, 114]], "reduce": [], "reorder": [[2]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}, {"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [32, 3, 1, 1], [2, 7, 2, 2], [7, 1, 4, 2]], "reduce": [[1, 1, 3], [1, 1, 3]], "reorder": [[2]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 3 | conv2d_depthwise_(1, 24, 56, 56, 6, 3, 2, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [6, 4, 1, 1], [58, 1, 1, 1], [1, 1, 58, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[0, 0]], "merge": [], "special": []}, {"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [24, 1, 1, 6], [7, 1, 4, 1], [1, 1, 14, 2]], "reduce": [[3, 1, 1], [1, 3, 1]], "reorder": [[0]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0, 0]], "unroll": [], "merge": [], "special": []}] 4 | conv2d_depthwise_(1, 32, 28, 28, 6, 3, 2, 1, 1)_opencl(0):[[{"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [8, 2, 1, 2], [3, 1, 10, 1], [1, 2, 15, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}, {"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [32, 6, 1, 1], [7, 1, 2, 1], [1, 1, 14, 1]], "reduce": [[3, 1, 1], [1, 1, 3]], "reorder": [[2]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 5 | conv2d_depthwise_(1, 64, 14, 14, 6, 3, 1, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [1, 1, 64, 1], [8, 1, 2, 1], [2, 8, 1, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}, {"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [64, 1, 6, 1], [1, 2, 7, 1], [1, 1, 7, 2]], "reduce": [[1, 1, 3], [3, 1, 1]], "reorder": [[0]], "inline": [], "unroll": [[0, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 6 | conv2d_depthwise_(1, 96, 14, 14, 6, 3, 2, 1, 1)_opencl(0):[[{"fuse": [[1, 2, 4]], "spatial": [[1, 1, 1, 1], [32, 1, 3, 1], [1, 
2, 4, 2], [1, 2, 8, 1]], "reduce": [], "reorder": [[1]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}, {"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [192, 1, 3, 1], [1, 1, 7, 1], [1, 1, 7, 1]], "reduce": [[1, 3, 1], [3, 1, 1]], "reorder": [[0]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 7 | conv2d_depthwise_(1, 160, 7, 7, 6, 3, 1, 1, 1)_opencl(0):[[{"fuse": [[1, 3, 4]], "spatial": [[1, 1, 1, 1], [40, 1, 2, 2], [3, 3, 1, 1], [1, 1, 9, 1]], "reduce": [], "reorder": [[0]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}, {"fuse": [[2, 3, 4]], "spatial": [[1, 1, 1, 1], [240, 1, 4, 1], [1, 1, 1, 7], [1, 1, 7, 1]], "reduce": [[1, 1, 3], [1, 1, 3]], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[1, 0]], "unroll": [], "merge": [], "special": []}] 8 | -------------------------------------------------------------------------------- /flextensor/optimize/gemm-config.log: -------------------------------------------------------------------------------- 1 | gemm_gemm_(256, 128, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 1, 2, 2], [1, 1, 32, 4]], "reduce": [[2, 32, 2]], "reorder": [[0]], "inline": [], "unroll": [[0, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 2 | gemm_gemm_(256, 256, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 4, 2, 1], [2, 1, 32, 4]], "reduce": [[1, 64, 4]], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 3 | -------------------------------------------------------------------------------- /flextensor/optimize/gemm-config.old.log: -------------------------------------------------------------------------------- 1 | gemm_gemm_(128, 128, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 2, 1, 1], [1, 1, 64, 2]], "reduce": [[64, 2, 1]], "reorder": [[0]], "inline": [], "unroll": [[1, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 2 | gemm_gemm_(256, 256, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[128, 1, 2, 1], [16, 1, 4, 4]], "reduce": [[4, 2, 32]], "reorder": [[1]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 3 | gemm_gemm_(256, 128, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 2, 2, 1], [2, 1, 32, 2]], "reduce": [[32, 2, 2]], "reorder": [[0]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 4 | gemm_gemm_(128, 256, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 2, 2, 1], [1, 1, 64, 2]], "reduce": [[8, 8, 4]], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 5 | gemm_gemm_(128, 128, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 2, 2, 1], [8, 1, 16, 2]], "reduce": 
[[1, 32, 4]], "reorder": [[0]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 6 | gemm_gemm_(128, 256, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 4, 1, 1], [4, 1, 64, 1]], "reduce": [[16, 4, 4]], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 7 | gemm_gemm_(256, 128, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 1, 2, 2], [4, 1, 8, 8]], "reduce": [[32, 4, 1]], "reorder": [[0]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 8 | gemm_gemm_(256, 256, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[128, 1, 2, 1], [4, 1, 8, 4]], "reduce": [[16, 16, 1]], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 9 | -------------------------------------------------------------------------------- /flextensor/optimize/gemm-config.v0.log: -------------------------------------------------------------------------------- 1 | gemm_gemm_(128, 128, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 2, 1, 1], [1, 1, 64, 2]], "reduce": [[64, 2, 1]], "reorder": [[0]], "inline": [], "unroll": [[1, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 2 | gemm_gemm_(256, 256, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[128, 1, 2, 1], [16, 1, 4, 4]], "reduce": [[4, 2, 32]], "reorder": [[1]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 3 | gemm_gemm_(256, 128, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 2, 2, 1], [2, 1, 32, 2]], "reduce": [[32, 2, 2]], "reorder": [[0]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 4 | gemm_gemm_(128, 256, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 2, 2, 1], [1, 1, 64, 2]], "reduce": [[8, 8, 4]], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 5 | gemm_gemm_(128, 128, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 2, 2, 1], [8, 1, 16, 2]], "reduce": [[1, 32, 4]], "reorder": [[0]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 6 | gemm_gemm_(128, 256, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 4, 1, 1], [4, 1, 64, 1]], "reduce": [[16, 4, 4]], "reorder": [[1]], "inline": [], "unroll": [[1500, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 7 | gemm_gemm_(256, 128, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[64, 1, 2, 2], [4, 1, 8, 8]], "reduce": [[32, 4, 1]], "reorder": 
[[0]], "inline": [], "unroll": [[512, 0]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 8 | gemm_gemm_(256, 256, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[128, 1, 2, 1], [4, 1, 8, 4]], "reduce": [[16, 16, 1]], "reorder": [[0]], "inline": [], "unroll": [[512, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 9 | -------------------------------------------------------------------------------- /flextensor/optimize/gemm-config.v1.log: -------------------------------------------------------------------------------- 1 | gemm_gemm_(256, 256, 256)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 4, 1, 2], [8, 2, 8, 2]], "reduce": [[8, 16, 2]], "reorder": [[0]], "inline": [], "unroll": [[1, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 2 | gemm_gemm_(256, 128, 128)_opencl(0):[[{"fuse": [[1, 2, 2]], "spatial": [[32, 1, 1, 8], [8, 2, 4, 2]], "reduce": [[1, 1, 128]], "reorder": [[0]], "inline": [], "unroll": [[1500, 1]], "merge": [], "special": []}], {"fuse": [], "spatial": [], "reduce": [], "reorder": [], "inline": [[0]], "unroll": [], "merge": [], "special": []}] 3 | -------------------------------------------------------------------------------- /flextensor/optimize/run_remote_opencl_conv1d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/env bash 2 | running() { 3 | beg=$1 4 | end=$(python3 -c "print($beg + 1)") 5 | # timeout=$(python3 -c "print((($beg + 3) * 2) if $beg < 6 else 7)") 6 | timeout=4 7 | stdbuf --output=0 --error=0 python3 optimize_conv1d.py \ 8 | --target_host "llvm -mtriple=aarch64-linux-android" \ 9 | --host 0.0.0.0 --port 9190 \ 10 | --use_rpc tracker \ 11 | --fcompile ndk \ 12 | --device_key android \ 13 | --target opencl \ 14 | --timeout $timeout \ 15 | --parallel 6 \ 16 | -f $beg -t $end \ 17 | -l conv1d-config.log \ 18 | 1>conv1d-$beg.log 2>conv1d-$beg.log 19 | # --test conv1d-config.log 20 | } 21 | 22 | start=${1:-0} 23 | stop=${2:-6} 24 | 25 | set -x 26 | for ((i = $start; i < $stop; i++)); do 27 | running $i 28 | done 29 | -------------------------------------------------------------------------------- /flextensor/optimize/run_remote_opencl_conv2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/env bash 2 | running() { 3 | beg=$1 4 | end=$(python3 -c "print($beg + 1)") 5 | # timeout=$(python3 -c "print((($beg + 3) * 2) if $beg < 6 else 7)") 6 | timeout=5 7 | stdbuf --output=0 --error=0 python3 optimize_conv2d.py \ 8 | --target_host "llvm -mtriple=aarch64-linux-android" \ 9 | --host 0.0.0.0 --port 9190 \ 10 | --use_rpc tracker \ 11 | --fcompile ndk \ 12 | --device_key android \ 13 | --target opencl \ 14 | --timeout $timeout \ 15 | --parallel 6 \ 16 | -f $beg -t $end \ 17 | -l conv2d-config.log \ 18 | --shapes mobile_v2 \ 19 | 1>conv2d-$beg.log 2>conv2d-$beg.log 20 | # --test conv2d-config.log 21 | } 22 | 23 | start=${1:-0} 24 | stop=${2:-7} 25 | 26 | set -x 27 | for ((i = $start; i < $stop; i++)); do 28 | running $i 29 | done 30 | -------------------------------------------------------------------------------- /flextensor/optimize/run_remote_opencl_depthwise_conv2d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/env 
bash 2 | running() { 3 | beg=$1 4 | end=$(python3 -c "print($beg + 1)") 5 | name=depthwise_conv2d 6 | # timeout=$(python3 -c "print((($beg + 3) * 2) if $beg < 6 else 7)") 7 | timeout=4 8 | stdbuf --output=0 --error=0 python3 optimize_$name.py \ 9 | --target_host "llvm -mtriple=aarch64-linux-android" \ 10 | --host 0.0.0.0 --port 9190 \ 11 | --use_rpc tracker \ 12 | --fcompile ndk \ 13 | --device_key android \ 14 | --target opencl \ 15 | --timeout $timeout \ 16 | --parallel 6 \ 17 | -f $beg -t $end \ 18 | -l $name-config.log \ 19 | --test $name-config.log --check 20 | # 1>$name-$beg.log 2>$name-$beg.log 21 | } 22 | 23 | start=${1:-0} 24 | stop=${2:-7} 25 | 26 | set -x 27 | for ((i = $start; i < $stop; i++)); do 28 | running $i 29 | done 30 | -------------------------------------------------------------------------------- /flextensor/optimize/run_remote_opencl_gemm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/env bash 2 | running() { 3 | beg=$1 4 | end=$(python3 -c "print($beg + 1)") 5 | timeout=$(python3 -c "print((($beg + 3) * 2) if $beg < 6 else 7)") 6 | # timeout=4 7 | stdbuf --output=0 --error=0 python3 optimize_gemm.py \ 8 | --target_host "llvm -mtriple=aarch64-linux-android" \ 9 | --host 0.0.0.0 --port 9190 \ 10 | --use_rpc tracker \ 11 | --fcompile ndk \ 12 | --device_key android \ 13 | --target opencl \ 14 | --timeout $timeout \ 15 | --parallel 6 \ 16 | -f $beg -t $end \ 17 | --test gemm-config.log --check 18 | # -l gemm-config.log \ 19 | # 1>gemm-$beg.log 2>gemm-$beg.log 20 | } 21 | 22 | start=${1:-0} 23 | stop=${2:-12} 24 | 25 | set -x 26 | for ((i = $start; i < $stop; i++)); do 27 | running $i 28 | done 29 | -------------------------------------------------------------------------------- /flextensor/project/tensor_graph/README.md: -------------------------------------------------------------------------------- 1 | # Tensor Graph: Using GNNs to Optimize Tensor Operators 2 | 3 | 1. Prerequisites: 4 | - Python >= 3.5 5 | - PyTorch >= 1.2.0 6 | - [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html) 7 | - [TVM >= 0.6.0](https://docs.tvm.ai/install/from_source.html) 8 | - [FlexTensor](https://github.com/KnowingNothing/FlexTensor.git) 9 | 10 | 2. Run: 11 | `python train.py --help` to see optional knobs 12 | 13 | 3. Use a trained model on Titan X: 14 | `python train.py --only_test --fmodel gemm_model/gemm_model.pkl --ftest dataset/gemm_test.txt --eval_dev 0` 15 | 16 | 4. The dataset: 17 | 18 | GEMM: gemm_train.txt, gemm_test.txt 19 | 20 | Conv2d: conv2d_train.txt, conv2d_test.txt 21 | 22 | 5. Any problems: 23 | File issues to https://github.com/KnowingNothing/FlexTensor.git 24 | Tensor Graph is currently an experimental feature of FlexTensor.
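6. Regenerating the dataset split:
   `dataset/preprocess.py` shuffles `all.txt` and rewrites `all_train.txt`/`all_test.txt` with a 0.8 train ratio. Run it from inside `dataset/` (it assumes `all.txt` is in the working directory): `cd dataset && python preprocess.py`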
-------------------------------------------------------------------------------- /flextensor/project/tensor_graph/conv2d_model/conv2d_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/project/tensor_graph/conv2d_model/conv2d_model.pkl -------------------------------------------------------------------------------- /flextensor/project/tensor_graph/dataset/preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | ratio = 0.8 5 | 6 | 7 | if __name__ == "__main__": 8 | raw = [] 9 | with open("all.txt", "r") as fin: 10 | for line in fin: 11 | raw.append(line) 12 | if not raw[-1][-1] == "\n": 13 | raw[-1] = raw[-1] + "\n" 14 | np.random.shuffle(raw) 15 | length = int(ratio * len(raw)) 16 | train = raw[:length] 17 | test = raw[length:] 18 | with open("all_train.txt", "w") as fout: 19 | fout.writelines(train) 20 | with open("all_test.txt", "w") as fout: 21 | fout.writelines(test) 22 | -------------------------------------------------------------------------------- /flextensor/project/tensor_graph/gemm_model/gemm_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/project/tensor_graph/gemm_model/gemm_model.pkl -------------------------------------------------------------------------------- /flextensor/project/tensor_graph/node.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import tvm 4 | from utils import strict_limit 5 | 6 | 7 | class Node(object): 8 | def __init__(self, feature, name=None): 9 | if not isinstance(feature, (list, tuple)): 10 | feature = [feature] 11 | self.feature = feature 12 | self.name = name 13 | 14 | 15 | def make_nodes_from_tensor(tensor): 16 | """ 17 | return: list of Node, one per tensor dimension, named "<tensor_name>/<dim>" 18 | """ 19 | assert isinstance(tensor, tvm.te.tensor.Tensor), strict_limit("tvm.te.tensor.Tensor") 20 | node_lst = [] 21 | for dim, val in enumerate(tensor.shape): 22 | assert isinstance(val, tvm.tir.IntImm), strict_limit("tvm.tir.IntImm") 23 | node_lst.append(Node(val.value, name="%s/%d" % (tensor.name, dim))) 24 | return node_lst 25 | 26 | 27 | def make_node_from_var(var, feature): 28 | assert isinstance(var, tvm.tir.Var), strict_limit("tvm.tir.Var") 29 | return Node(feature, name=var.name) 30 | 31 | -------------------------------------------------------------------------------- /flextensor/project/tensor_graph/preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | ratio = 0.8 5 | 6 | 7 | if __name__ == "__main__": 8 | raw = [] 9 | with open("data.txt", "r") as fin: 10 | for line in fin: 11 | raw.append(line) 12 | if not raw[-1][-1] == "\n": 13 | raw[-1] = raw[-1] + "\n" 14 | np.random.shuffle(raw) 15 | length = int(ratio * len(raw)) 16 | train = raw[:length] 17 | test = raw[length:] 18 | with open("train.txt", "w") as fout: 19 | fout.writelines(train) 20 | with open("test.txt", "w") as fout: 21 | fout.writelines(test) -------------------------------------------------------------------------------- /flextensor/templates/__init__.py: -------------------------------------------------------------------------------- 1 | from .cpu import * 2 | from .cuda import * 3 | from .opencl import * 4 | 
-------------------------------------------------------------------------------- /flextensor/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/test/__init__.py -------------------------------------------------------------------------------- /flextensor/test/naive_schedule_all.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import sys 4 | import tvm 5 | from flextensor.task import TASK_TABLE 6 | 7 | 8 | def print_source(s, bufs, target, file=sys.stdout): 9 | func = tvm.build(s, bufs, target) 10 | if target in ["cuda", "opencl"]: 11 | print(func.imported_modules[0].get_source(), file=file) 12 | 13 | def recursive_fuse(s, cur, flag=False): 14 | for t in s[cur].op.input_tensors: 15 | if isinstance(t.op, tvm.te.tensor.ComputeOp): 16 | recursive_fuse(s, t.op, True) 17 | if flag: 18 | s[cur].compute_inline() 19 | 20 | hit_set = set() 21 | for task in TASK_TABLE.values(): 22 | if (task.target == "cuda" and "gemm_conv" not in task.key 23 | and "mttkrp" not in task.key and "block_circulant_matrix" not in task.key 24 | and "pixel" not in task.key and "unpool" not in task.key and "shift" not in task.key 25 | and "packed" not in task.key): 26 | prefix = task.key.rsplit("_", 4)[0] 27 | if prefix in hit_set: 28 | continue 29 | hit_set.add(prefix) 30 | outops, bufs = task.func(*task.args) 31 | s = tvm.te.create_schedule(outops) 32 | bx = tvm.te.thread_axis("blockIdx.x") 33 | op = outops[0] 34 | recursive_fuse(s, op) 35 | outer, inner = s[op].split(s[op].op.axis[0], nparts=1) 36 | s[op].bind(outer, bx) 37 | print(task.key) 38 | print_source(s, bufs, "opencl") -------------------------------------------------------------------------------- /flextensor/test/pyimpl.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | 4 | import numpy as np 5 | 6 | 7 | def conv2d_nchwc(inputs, weight, bias=None, stride=1, padding=0, dilation=1, groups=1): 8 | """Convolution 2d NCHWc layout 9 | 10 | Args: 11 | ----------------------------- 12 | inputs : np.ndarray 13 | shape [batch, channel // vlen, height, width, vlen] 14 | weight : np.ndarray 15 | shape [out_channel // vlen, channel // vlen // groups, kernel_height, kernel_width, vlen(i), vlen(o)] 16 | bias : (optional:None) np.ndarray 17 | shape [out_channel // vlen, vlen] 18 | stride : (optional:1) int or tuple 19 | 20 | padding : (optional:0) int or tuple 21 | 22 | dilation: (optional:1) int 23 | 24 | groups : (optional:1) int 25 | ----------------------------- 26 | 27 | Returns: 28 | ----------------------------- 29 | np.ndarray 30 | shape [batch, out_channel // vlen, output_height, output_width, vlen] 31 | ----------------------------- 32 | """ 33 | batch_size, in_channel_chunk, in_h, in_w, in_channel_block = inputs.shape 34 | out_channel_chunk, channel_per_group_chunk, k_h, k_w, _in_channel_block, out_channel_block = weight.shape 35 | assert ((channel_per_group_chunk * groups) == in_channel_chunk) 36 | assert _in_channel_block == in_channel_block 37 | assert in_channel_block == out_channel_block 38 | out_channel_per_group = out_channel_chunk // groups 39 | assert ((out_channel_per_group * groups) == out_channel_chunk) 40 | 41 | stride = (stride, stride) if isinstance(stride, int) else stride 42 | padding = (padding, padding) if isinstance(padding, 
int) else padding 43 | dilation = (dilation, dilation) if isinstance(dilation, int) else dilation 44 | assert (isinstance(stride, tuple) and len(stride) == 2) 45 | assert (isinstance(padding, tuple) and len(padding) == 2) 46 | assert (isinstance(dilation, tuple) and len(dilation) == 2) 47 | 48 | out_h = (in_h + 2 * padding[0] - dilation[0] * (k_h - 1) - 1) // stride[0] + 1 49 | out_w = (in_w + 2 * padding[1] - dilation[1] * (k_w - 1) - 1) // stride[1] + 1 50 | 51 | output = np.zeros((batch_size, out_channel_chunk, out_h, out_w, out_channel_block), dtype=inputs.dtype) 52 | for b in range(batch_size): 53 | for c_c in range(out_channel_chunk): 54 | for h in range(out_h): 55 | for w in range(out_w): 56 | for c_b in range(out_channel_block): 57 | for rc_chunk in range(channel_per_group_chunk): 58 | for rc_block in range(in_channel_block): 59 | for rh in range(k_h): 60 | for rw in range(k_w): 61 | h_index = h * stride[0] + rh * dilation[0] - padding[0] 62 | w_index = w * stride[1] + rw * dilation[1] - padding[1] 63 | if 0 <= h_index < in_h and 0 <= w_index < in_w: 64 | output[b, c_c, h, w, c_b] += inputs[b, 65 | c_c // out_channel_per_group * channel_per_group_chunk + rc_chunk, 66 | h_index, 67 | w_index, 68 | rc_block] * weight[c_c, rc_chunk, rh, rw, rc_block, c_b] 69 | 70 | if bias is not None: 71 | for b in range(batch_size): 72 | for c_c in range(out_channel_chunk): 73 | for h in range(out_h): 74 | for w in range(out_w): 75 | for c_b in range(out_channel_block): 76 | output[b, c_c, h, w, c_b] += bias[c_c, c_b] 77 | 78 | return output -------------------------------------------------------------------------------- /flextensor/test/test_scheduler.py: -------------------------------------------------------------------------------- 1 | import time 2 | import tvm 3 | from flextensor.utils import Config 4 | from flextensor.task import Task 5 | from flextensor.scheduler import schedule, schedule_with_config 6 | from flextensor.measure import _evaluate 7 | 8 | 9 | def test(): 10 | # create an empty task but has the correct key we want 11 | task = Task("yolo1", None, (1, 3, 448, 448, 64, 7, 2, 3, 1, 1), "llvm", 0) 12 | beg = time.time() 13 | # s, bufs, configs = schedule(task.key) 14 | end = time.time() 15 | # print(tvm.lower(s, bufs, simple_mode=True)) 16 | # print("######################################") 17 | # print("op schedules:") 18 | # for config in configs.op_config_lst: 19 | # print("----------------------------------") 20 | # for name, value in config.items(): 21 | # if value: 22 | # print(name, value) 23 | # print("graph schedules:") 24 | # for name, value in configs.graph_config.items(): 25 | # if value: 26 | # print(name, value) 27 | op_configs = [ 28 | { 29 | "spatial": [[1, 1, 1, 1], [1, 1, 1, 3], [454, 1, 1, 1], [1, 227, 2, 1]], 30 | "unroll": [[1500, 1]] 31 | }, 32 | { 33 | "spatial": [[1, 1, 1, 1], [2, 4, 2, 4], [8, 1, 4, 7], [7, 1, 16, 2]], 34 | "reduce": [[1, 3, 1], [7, 1, 1], [7, 1, 1]], 35 | "unroll": [[1500, 1]] 36 | } 37 | ] 38 | graph_config = {"inline": [[0, 0]]} 39 | configs = Config(op_configs, graph_config) 40 | 41 | s, bufs = schedule_with_config(task.key, configs) 42 | time_cost = _evaluate(s, bufs, "llvm", 0, 10) 43 | print("Use", time_cost, "ms") 44 | print("Cost", end - beg, "s") 45 | 46 | 47 | if __name__ == "__main__": 48 | test() -------------------------------------------------------------------------------- /flextensor/test/test_tvm/grad/dqn_pytorch.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 
import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | def _calc_fc1_in_features(input_sz, channels_before_flatten): 9 | def _calc_conv_osize(sz, k, s, pad): 10 | return math.floor((sz + 2*pad - k) / s) + 1 11 | isz = input_sz 12 | isz = _calc_conv_osize(isz, 8, 4, 0) 13 | isz = _calc_conv_osize(isz, 4, 2, 0) 14 | isz = _calc_conv_osize(isz, 3, 1, 0) 15 | return isz * isz * channels_before_flatten 16 | 17 | 18 | class DQN(nn.Module): 19 | def __init__(self, num_actions=18, image_shape=(4, 84, 84)): 20 | super().__init__() 21 | 22 | self.num_actions = num_actions 23 | self.image_shape = image_shape 24 | input_c, input_h, input_w = image_shape 25 | assert input_h == input_w, "input image must be square" 26 | fc1_in_features = _calc_fc1_in_features(input_h, 64) 27 | 28 | self.conv1 = nn.Conv2d(input_c, 32, kernel_size=8, stride=4, padding=0, bias=True) 29 | self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0, bias=True) 30 | self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0, bias=True) 31 | self.fc1 = nn.Linear(fc1_in_features, 512, bias=True) 32 | self.fc2 = nn.Linear(512, num_actions, bias=True) 33 | 34 | def forward(self, x): 35 | x = F.relu(self.conv1(x)) 36 | x = F.relu(self.conv2(x)) 37 | x = F.relu(self.conv3(x)) 38 | x = torch.flatten(x) 39 | x = F.relu(self.fc1(x)) 40 | x = self.fc2(x) 41 | return x 42 | 43 | 44 | model = DQN() 45 | dummy_data = torch.randn(1, *model.image_shape) 46 | model(dummy_data) 47 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm/grad/relay-dqn.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | from tvm import relay 5 | 6 | import tvm.relay.testing 7 | import tvm.contrib.graph_runtime as runtime 8 | 9 | from dqn_pytorch import DQN 10 | 11 | batch_size = 1 12 | 13 | num_actions = 18 14 | 15 | image_shape = (4, 84, 84) 16 | 17 | target = "llvm" 18 | 19 | dtype = "float32" 20 | 21 | input_shape = (batch_size, *image_shape) 22 | 23 | input_type = relay.TensorType(input_shape, dtype) 24 | 25 | print("Get DQN network...") 26 | net = relay.testing.dqn.get_net( 27 | batch_size, num_actions=num_actions, image_shape=image_shape, dtype=dtype) 28 | 29 | fmod, fparams = relay.testing.dqn.get_workload(batch_size) 30 | 31 | print("Get gradient...") 32 | bnet = relay.transform.gradient(fmod["main"], mode='first_order') # default: higher_order 33 | 34 | print("Make workload") 35 | mod, params = relay.testing.create_workload(bnet) # print(mod.get_global_vars()) # [GlobalVar(main)] 36 | 37 | pytorch_model = DQN(num_actions=num_actions, image_shape=image_shape) 38 | param_name_mapping = { 39 | 'conv1.weight': 'conv1_weight', 40 | 'conv1.bias': 'conv1_bias', 41 | 'conv2.weight': 'conv2_weight', 42 | 'conv2.bias': 'conv2_bias', 43 | 'conv3.weight': 'conv3_weight', 44 | 'conv3.bias': 'conv3_bias', 45 | 'fc1.weight': 'dense1_weight', 46 | 'fc1.bias': 'dense1_bias', 47 | 'fc2.weight': 'dense2_weight', 48 | 'fc2.bias': 'dense2_bias', 49 | } 50 | pytorch_model.load_state_dict({ 51 | pth_key: torch.from_numpy(params[tvm_key].asnumpy()) 52 | for pth_key, tvm_key in param_name_mapping.items() 53 | }, strict=True) 54 | pytorch_model.train() 55 | 56 | print("Build graph...") 57 | with relay.build_config(opt_level=3): 58 | graph, lib, params = relay.build_module.build( 59 | mod, target=target, params=params) 60 | 61 | ctx = tvm.device(str(target), 0) 62 | 63 | print("Create 
runtime...") 64 | module = runtime.create(graph, lib, ctx) 65 | 66 | print("Set inputs...") 67 | data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) 68 | module.set_input('data', data_tvm) 69 | module.set_input(**params) 70 | 71 | module.run() 72 | 73 | print(f'#outputs: {module.get_num_outputs()}') 74 | 75 | relay_output = module.get_output(0).asnumpy() 76 | pytorch_output = pytorch_model(torch.from_numpy(data_tvm.asnumpy())) 77 | pytorch_output_np = pytorch_output.data.numpy() 78 | pytorch_output.sum().backward() 79 | 80 | for output_idx in range(1, 12): 81 | output = module.get_output(output_idx) 82 | print(f'Shape: {output.shape}', end=' ') 83 | print(f'Mean: {output.asnumpy().mean()}') 84 | 85 | for name, param in pytorch_model.named_parameters(): 86 | print(f'{name}: {param.grad.mean().item()}') 87 | 88 | print(f'Allclose: {np.allclose(relay_output, pytorch_output_np)}') -------------------------------------------------------------------------------- /flextensor/test/test_tvm/graph/placeholder-only.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | """ 4 | We can't build a single placeholder in TVM, 5 | which will returns Segmentation Fault 6 | """ 7 | 8 | A = tvm.te.placeholder([4, 4], dtype="float32", name="A") 9 | 10 | s = tvm.te.create_schedule(A.op) 11 | 12 | func = tvm.build(s, [A], "llvm") 13 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm/graph/share-placeholder.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | 4 | 5 | dtype = "float32" 6 | 7 | A = tvm.te.placeholder([4, 4], dtype=dtype, name="A") 8 | 9 | B = tvm.te.compute([4, 4], lambda i, j: A[i, j] + 1, name="B") 10 | 11 | C = tvm.te.compute([4, 4], lambda i, j: A[i, j] * 2, name="C") 12 | 13 | target = "llvm" 14 | 15 | s1 = tvm.te.create_schedule(B.op) 16 | 17 | s2 = tvm.te.create_schedule(C.op) 18 | 19 | s3 = tvm.te.create_schedule([B.op, C.op]) 20 | 21 | func1 = tvm.build(s1, [A, B], target=target) 22 | 23 | func2 = tvm.build(s2, [A, C], target=target) 24 | 25 | func3 = tvm.build(s3, [A, B, C], target=target) 26 | 27 | ctx = tvm.device(target) 28 | 29 | A_np = np.random.uniform(-1, 1, [4, 4]).astype(dtype) 30 | B_np = np.zeros([4, 4]).astype(dtype) 31 | C_np = np.zeros([4, 4]).astype(dtype) 32 | 33 | print("Inputs:") 34 | print(A_np) 35 | 36 | 37 | def run(func, id): 38 | A_tvm = tvm.nd.array(A_np, ctx) 39 | B_tvm = tvm.nd.array(B_np, ctx) 40 | C_tvm = tvm.nd.array(C_np, ctx) 41 | if id == 0: 42 | func(A_tvm, B_tvm) 43 | print("Outputs:") 44 | print(B_tvm.asnumpy()) 45 | elif id == 1: 46 | func(A_tvm, C_tvm) 47 | print("Outputs:") 48 | print(C_tvm.asnumpy()) 49 | elif id == 2: 50 | func(A_tvm, B_tvm, C_tvm) 51 | print("Outputs 1:") 52 | print(B_tvm.asnumpy()) 53 | print("Outputs 2:") 54 | print(C_tvm.asnumpy()) 55 | 56 | run(func1, 0) 57 | run(func2, 1) 58 | run(func3, 2) 59 | 60 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm/legacy/multi_compute_inline.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | A = tvm.te.placeholder((32, 32, 32, 32), dtype="float32", name="A") 4 | B = tvm.te.compute((32, 30, 32, 32), lambda i, j, p, q: (A[i, j, p, q] + A[i, j+1, p, q] + A[i, j+2, p, q]) / 3, name="B") 5 | C = tvm.te.compute((30, 30, 32, 32), lambda a, b, c, d: (B[a, b, c, d] + B[a + 1, b, c, 
d] + B[a + 2, b, c, d]), name="C") 6 | D = tvm.te.compute((30, 30, 32, 32), lambda h, k, l, m: (C[h, k, l, m] * 2), name="D") 7 | E = tvm.te.compute((30, 30, 32, 32), lambda h, k, l, m: (C[h, k, l, m] * 3), name="E") 8 | F = tvm.te.compute((30, 30, 32, 32), lambda h, k, l, m: (D[h, k, l, m] + E[h, k, l, m]), name="F") 9 | 10 | s = tvm.te.create_schedule(F.op) 11 | s[C].compute_inline() 12 | print(str(tvm.lower(s, [A, F], simple_mode=True))) 13 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm/legacy/test_compute_inline.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | 4 | def compute_inline_reduce(): 5 | A = tvm.te.placeholder((32, 32, 32, 32), dtype="float32", name="A") 6 | B = tvm.te.placeholder((32, 32), dtype="float32", name="B") 7 | k = tvm.te.reduce_axis((0, 32), name="k") 8 | C = tvm.te.compute((30, 30, 32, 32), lambda a, b, c, d: tvm.te.sum(A[a, b, c, k] * B[k, d], axis=k), name="C") 9 | D = tvm.te.compute((30, 30, 32, 32), lambda h, k, l, m: (C[h, k, l, m] * 2), name="D") 10 | E = tvm.te.compute((30, 30, 32, 32), lambda h, k, l, m: (C[h, k, l, m] * 3), name="E") 11 | F = tvm.te.compute((30, 30, 32, 32), lambda h, k, l, m: (D[h, k, l, m] + E[h, k, l, m]), name="F") 12 | 13 | s = tvm.te.create_schedule(F.op) 14 | s[C].compute_inline() 15 | try: 16 | tvm.build(s, [A, F], "llvm") 17 | except Exception as e: 18 | return False, str(e) 19 | return True, "pass" 20 | 21 | 22 | def compute_inline_output(): 23 | N = 1024 24 | M = 512 25 | A = tvm.te.placeholder((M, N), name="A") 26 | B, C = tvm.te.compute((M, N), lambda i, j: (A[i, j] + 1, A[i, j] * 2), name="B_C") 27 | D = tvm.te.compute((M, N), lambda i, j: B[i, j] * 2, name="D") 28 | 29 | s = tvm.te.create_schedule(D.op) 30 | s[B].compute_inline() 31 | try: 32 | tvm.build(s, [A, C, D], "llvm") 33 | except Exception as e: 34 | return False, str(e) 35 | return True, "pass" 36 | 37 | 38 | if __name__ == "__main__": 39 | print("compute_inline_reduce:", *compute_inline_reduce()) 40 | print("compute_inline_output:", *compute_inline_output()) 41 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm/legacy/test_conv2d_hwcn_map.py: -------------------------------------------------------------------------------- 1 | """Example code to do convolution.""" 2 | import os 3 | import numpy as np 4 | import scipy.signal 5 | import tvm 6 | from tvm.contrib import nvcc 7 | import topi 8 | from topi.util import get_const_tuple 9 | 10 | TASK = "conv2d_hwcn_map" 11 | USE_MANUAL_CODE = False 12 | 13 | @tvm.register_func 14 | def tvm_callback_cuda_compile(code): 15 | ptx = nvcc.compile_cuda(code, target="ptx") 16 | return ptx 17 | 18 | def write_code(code, fname): 19 | with open(fname, "w") as f: 20 | f.write(code) 21 | 22 | @tvm.register_func 23 | def tvm_callback_cuda_postproc(code): 24 | if not os.path.exists("perf"): 25 | os.mkdir("perf") 26 | write_code(code, "perf/%s_generated.cu" % TASK) 27 | if USE_MANUAL_CODE: 28 | code = open("perf/%s_manual.cu" % TASK).read() 29 | return code 30 | 31 | 32 | def test_conv2d_hwcn_map(): 33 | batch = 64 34 | in_channel = 128 35 | in_height = 16 36 | in_width = 16 37 | num_filter = 128 38 | kernel = 3 39 | stride = 2 40 | padding = 'SAME' 41 | 42 | A = tvm.te.placeholder((in_height, in_width, in_channel, batch), name='A') 43 | W = tvm.te.placeholder((kernel, kernel, in_channel, num_filter), name='W') 44 | B = topi.nn.conv2d_hwcn(A, W, stride, padding) 
45 | C = topi.nn.relu(B) 46 | s1 = topi.cuda.schedule_conv2d_hwcn([B]) 47 | s2 = topi.cuda.schedule_conv2d_hwcn([C]) 48 | 49 | a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) 50 | w_np = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype) 51 | b_np = topi.testing.conv2d_hwcn_python(a_np, w_np, stride, padding) 52 | c_np = np.maximum(b_np, 0) 53 | 54 | def check_device(device): 55 | if not tvm.runtime.module.enabled(device): 56 | print("Skip because %s is not enabled" % device) 57 | return 58 | ctx = tvm.device(device, 0) 59 | a = tvm.nd.array(a_np, ctx) 60 | w = tvm.nd.array(w_np, ctx) 61 | b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) 62 | c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) 63 | with tvm.build_config(auto_unroll_max_step=128, 64 | unroll_explicit=device == 'rocm'): 65 | func1 = tvm.build(s1, [A, W, B], device) 66 | func1(a, w, b) 67 | tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) 68 | func2 = tvm.build(s2, [A, W, C], device) 69 | func2(a, w, c) 70 | tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) 71 | 72 | for device in ['cuda', 'opencl', 'rocm']: 73 | check_device(device) 74 | 75 | 76 | if __name__ == "__main__": 77 | test_conv2d_hwcn_map() 78 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm/legacy/test_multi_outputs.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | 4 | A = tvm.te.placeholder((10, 10), name="A") 5 | B0, B1 = tvm.te.compute((10, 10), lambda i, j: (A[i, j] + 1, A[i, j] * 2), name="B") 6 | s = tvm.te.create_schedule(B1.op) 7 | cache0 = s.cache_write(B0.op.output(0), "local") 8 | cache1 = s.cache_write(B1.op.output(1), "local") 9 | print(tvm.lower(s, [A, B0, B1], simple_mode=True)) 10 | func = tvm.build(s, [A, B0, B1], "llvm") -------------------------------------------------------------------------------- /flextensor/test/test_tvm/legacy/test_one_thread.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/test/test_tvm/legacy/test_one_thread.py -------------------------------------------------------------------------------- /flextensor/test/test_tvm/legacy/test_reduce_map.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tvm 3 | from tvm.contrib import nvcc 4 | import numpy as np 5 | 6 | import topi 7 | 8 | 9 | TASK = "reduce_map" 10 | USE_MANUAL_CODE = False 11 | 12 | 13 | @tvm.register_func 14 | def tvm_callback_cuda_compile(code): 15 | ptx = nvcc.compile_cuda(code, target="ptx") 16 | return ptx 17 | 18 | 19 | def write_code(code, fname): 20 | with open(fname, "w") as f: 21 | f.write(code) 22 | 23 | 24 | @tvm.register_func 25 | def tvm_callback_cuda_postproc(code): 26 | if not os.path.exists("perf"): 27 | os.mkdir("perf") 28 | write_code(code, "perf/%s_generated.cu" % TASK) 29 | if USE_MANUAL_CODE: 30 | code = open("perf/%s_manual.cu" % TASK).read() 31 | return code 32 | 33 | 34 | def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0): 35 | global TASK 36 | # Build the logic and compile the function 37 | A = tvm.te.placeholder(shape=in_shape, name="A") 38 | if type == "sum": 39 | TASK = "sum_map_id%d" %test_id 40 | B = topi.sum(A, axis=axis, keepdims=keepdims) 41 | elif type == "max": 42 | TASK = "max_map_id%d" %test_id 43 | 
B = topi.max(A, axis=axis, keepdims=keepdims) 44 | elif type == "min": 45 | TASK = "min_map_id%d" %test_id 46 | B = topi.min(A, axis=axis, keepdims=keepdims) 47 | else: 48 | raise NotImplementedError 49 | s = topi.cuda.schedule_reduce(B) 50 | with tvm.build_config(auto_unroll_max_step=16, 51 | auto_unroll_min_depth=0): 52 | fcuda = tvm.build(s, [A, B], "cuda", name="sum") 53 | 54 | # Test 55 | in_npy = np.random.normal(size=in_shape).astype(np.float32) 56 | if type == "sum": 57 | out_npy = in_npy.sum(axis=axis, keepdims=keepdims) 58 | elif type == "max": 59 | out_npy = in_npy.max(axis=axis, keepdims=keepdims) 60 | elif type == "min": 61 | out_npy = in_npy.min(axis=axis, keepdims=keepdims) 62 | else: 63 | raise NotImplementedError 64 | 65 | data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu()) 66 | out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu()) 67 | 68 | for _ in range(2): 69 | fcuda(data_tvm, out_tvm) 70 | tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, rtol=4e-4, atol=4e-4) 71 | 72 | if __name__ == "__main__": 73 | test_reduce_map(in_shape=(128, 24, 128, 24), 74 | axis=(1, 2, 3), 75 | keepdims=True, 76 | type="sum", 77 | test_id=0) 78 | test_reduce_map(in_shape=(128, 24 * 128 * 24), 79 | axis=(1,), 80 | keepdims=False, 81 | type="max", 82 | test_id=1) 83 | test_reduce_map(in_shape=(32, 128, 24), 84 | axis=None, 85 | keepdims=True, 86 | type="sum", 87 | test_id=2) 88 | test_reduce_map(in_shape=(128, 24, 128, 24), 89 | axis=(0, 2), 90 | keepdims=False, 91 | type="min", 92 | test_id=3) 93 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/requires_grad.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | A = tvm.te.placeholder([4, 4]) 4 | 5 | print("A requires_grad=", A.requires_grad) 6 | 7 | B = tvm.te.placeholder([4, 4], requires_grad=True) 8 | 9 | print("B requires_grad=", B.requires_grad) 10 | 11 | C = tvm.te.compute([4, 4], lambda i, j: A[i, j]) 12 | 13 | print("C requires_grad=", C.requires_grad) 14 | 15 | D = tvm.te.compute([4, 4], lambda i, j: A[i, j], requires_grad=True) 16 | 17 | print("D requires_grad=", D.requires_grad) 18 | 19 | E = tvm.te.compute([4, 4], lambda i, j: B[i, j]) 20 | 21 | print("E requires_grad=", E.requires_grad) 22 | 23 | F = tvm.te.compute([4, 4], lambda i, j: B[i, j], requires_grad=True) 24 | 25 | print("F requires_grad=", F.requires_grad) -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-avgpool-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | 4 | 5 | H = 6 6 | W = 6 7 | 8 | R = 2 9 | S = 2 10 | 11 | P = H // R 12 | Q = W // S 13 | 14 | dtype = "float32" 15 | 16 | A = tvm.te.placeholder([H, W], dtype=dtype, name="A") 17 | 18 | r = tvm.te.reduce_axis([0, R], name="r") 19 | s = tvm.te.reduce_axis([0, S], name="s") 20 | 21 | C = tvm.te.compute([P, Q], 22 | lambda i, j: tvm.te.sum(A[i * R + r, j * S + s]/(R*S), axis=[r, s]), name="C") 23 | 24 | dC = tvm.te.placeholder([P, Q], dtype=dtype, name="dC") 25 | 26 | 27 | dA = tvm.te.grad_op(A, C, dC) 28 | 29 | s = tvm.te.create_schedule(dA.op) 30 | 31 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 32 | 33 | func = tvm.build(s, [A, dC, dA], target="llvm") 34 | 35 | A_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 36 | dC_np = np.random.uniform(-10, 10, [P, Q]).astype("float32") 37 | dA_np = np.zeros([H, 
W]).astype("float32") 38 | 39 | ctx = tvm.device("llvm", 0) 40 | A_tvm = tvm.nd.array(A_np, ctx) 41 | dC_tvm = tvm.nd.array(dC_np, ctx) 42 | dA_tvm = tvm.nd.array(dA_np, ctx) 43 | 44 | func(A_tvm, dC_tvm, dA_tvm) 45 | 46 | print(dC_tvm) 47 | print(dA_tvm.asnumpy()) 48 | 49 | # =======> 50 | # compare the results with numpy 51 | golden_np = np.zeros([H, W]).astype("float32") 52 | for i in range(0, P): 53 | for j in range(0, Q): 54 | for di in range(0, R): 55 | for dj in range(0, S): 56 | assert(i+di < H) 57 | assert(j+dj < W) 58 | golden_np[i*R+di][j*S+dj] = dC_np[i][j] / (R * S) 59 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-30) 60 | print("Compare with Numpy success!") 61 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-broadcast-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | 4 | 5 | H = 8 6 | W = 16 7 | 8 | dtype = "float32" 9 | 10 | A = tvm.te.placeholder([H], dtype=dtype, name="A") 11 | C = tvm.te.compute([H, W], 12 | lambda i, j: 13 | A[i], name="C") 14 | 15 | dC = tvm.te.placeholder([H, W], dtype=dtype, name="dC") 16 | 17 | dA = tvm.te.grad_op(A, C, dC) 18 | 19 | s = tvm.te.create_schedule(dA.op) 20 | 21 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 22 | 23 | func = tvm.build(s, [A, dC, dA], target="llvm") 24 | 25 | A_np = np.random.uniform(-10, 10, [H]).astype("float32") 26 | dC_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 27 | dA_np = np.zeros([H]).astype("float32") 28 | 29 | ctx = tvm.device("llvm", 0) 30 | A_tvm = tvm.nd.array(A_np, ctx) 31 | dC_tvm = tvm.nd.array(dC_np, ctx) 32 | dA_tvm = tvm.nd.array(dA_np, ctx) 33 | 34 | func(A_tvm, dC_tvm, dA_tvm) 35 | 36 | print(dA_tvm) 37 | 38 | # =======> 39 | # compare the results with numpy 40 | golden_np = np.sum(dC_np, axis=1) 41 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-6) 42 | print("Compare with Numpy success!") 43 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-cat-case1.py: -------------------------------------------------------------------------------- 1 | import topi 2 | import tvm 3 | import numpy as np 4 | import torch 5 | 6 | 7 | dim0 = 8 8 | dim1 = 3 9 | dim2 = 4 10 | shape_size1 = [dim0, dim1] 11 | shape_size2 = [dim0, dim2] 12 | dtype = "float32" 13 | 14 | A = tvm.te.placeholder(shape_size1, dtype=dtype, name="A") 15 | B = tvm.te.placeholder(shape_size2, dtype=dtype, name="B") 16 | C = topi.concatenate([A, B], axis=1) 17 | 18 | dC = tvm.te.placeholder(C.shape, dtype=dtype, name="dC") 19 | dA, dB = tvm.te.mygradient(C, [A, B], dC) 20 | 21 | s = tvm.te.create_schedule([C.op, dA.op, dB.op]) 22 | 23 | print(tvm.lower(s, [A, B, dC, dA, dB], simple_mode=True)) 24 | 25 | func = tvm.build(s, [A, B, dC, dA, dB], target="llvm") 26 | 27 | A_np = np.random.uniform(-10, 10, shape_size1).astype("float32") 28 | B_np = np.random.uniform(-10, 10, shape_size2).astype("float32") 29 | 30 | dC_np = np.ones([dim0, dim1+dim2]).astype("float32") 31 | dA_np = np.zeros(shape_size1).astype("float32") 32 | dB_np = np.zeros(shape_size2).astype("float32") 33 | 34 | ctx = tvm.device("llvm", 0) 35 | A_tvm = tvm.nd.array(A_np, ctx) 36 | B_tvm = tvm.nd.array(B_np, ctx) 37 | 38 | dC_tvm = tvm.nd.array(dC_np, ctx) 39 | dA_tvm = tvm.nd.array(dA_np, ctx) 40 | dB_tvm = tvm.nd.array(dB_np, ctx) 41 | func(A_tvm, B_tvm, dC_tvm, dA_tvm, dB_tvm) 42 | 43 | print("dA_tvm", 
dA_tvm) 44 | 45 | # =======> 46 | # compare the results with pytorch 47 | A_torch = torch.tensor(A_np, requires_grad=True) 48 | B_torch = torch.tensor(B_np, requires_grad=True) 49 | C_torch = torch.cat([A_torch, B_torch], dim=1) 50 | loss = C_torch.sum() 51 | loss.backward() 52 | print("Pytorch gradient:\n", A_torch.grad.numpy(), B_torch.grad.numpy()) 53 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A_torch.grad.numpy(), atol=1e-30, rtol=1e-30) 54 | tvm.testing.assert_allclose(dB_tvm.asnumpy(), B_torch.grad.numpy(), atol=1e-30, rtol=1e-30) 55 | print("Compare with PyTorch success!") 56 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-conv2d-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | N = 2 7 | nC = 16 8 | H = 14 9 | W = 14 10 | K = 8 11 | R = 3 12 | S = 3 13 | 14 | st = 1 15 | 16 | P = (H - R + 1) // st 17 | Q = (W - S + 1) // st 18 | 19 | dtype = "float32" 20 | 21 | A = tvm.te.placeholder([N, nC, H, W], dtype=dtype, name="A") 22 | B = tvm.te.placeholder([K, nC, R, S], dtype=dtype, name="B") 23 | c = tvm.te.reduce_axis([0, nC], name="c") 24 | r = tvm.te.reduce_axis([0, R], name="r") 25 | s = tvm.te.reduce_axis([0, S], name="s") 26 | C = tvm.te.compute([N, K, P, Q], 27 | lambda n, k, h, w : 28 | tvm.te.sum(A[n, c, h * st + r, w * st + s] * B[k, c, r, s], axis=[c,r,s]), name="C") 29 | 30 | dC = tvm.te.placeholder([N, K, P, Q], dtype=dtype, name="dC") 31 | 32 | print(C.op.body) 33 | 34 | print(dir(C.op.body[0].source[0])) 35 | 36 | print(tvm.te.expr_equal(C.op.body[0].source[0].b.args[0], C.op.body[0].source[0].b.args[1])) 37 | 38 | dA = tvm.te.grad_op(A, C, dC) 39 | 40 | s = tvm.te.create_schedule(dA.op) 41 | 42 | print(tvm.lower(s, [A, B, dC, dA], simple_mode=True)) 43 | 44 | func = tvm.build(s, [A, B, dC, dA], target="llvm") 45 | 46 | A_np = np.random.uniform(-10, 10, [N, nC, H, W]).astype("float32") 47 | B_np = np.random.uniform(-10, 10, [K, nC, R, S]).astype("float32") 48 | dC_np = np.random.uniform(-10, 10, [N, K, P, Q]).astype("float32") 49 | dA_np = np.zeros([N, nC, H, W]).astype("float32") 50 | 51 | ctx = tvm.device("llvm", 0) 52 | A_tvm = tvm.nd.array(A_np, ctx) 53 | B_tvm = tvm.nd.array(B_np, ctx) 54 | dC_tvm = tvm.nd.array(dC_np, ctx) 55 | dA_tvm = tvm.nd.array(dA_np, ctx) 56 | 57 | func(A_tvm, B_tvm, dC_tvm, dA_tvm) 58 | 59 | print(dA_tvm) 60 | 61 | # =======> 62 | # compare the results with pytorch 63 | A_torch = torch.tensor(A_np) 64 | B_torch = torch.tensor(B_np) 65 | dC_torch = torch.tensor(dC_np) 66 | golden_torch = torch.nn.functional.conv_transpose2d(dC_torch, B_torch) 67 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_torch.numpy(), rtol=1e-3) 68 | print("Compare with PyTorch success!") 69 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-conv2d-case2.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | N = 3 7 | nC = 1024 8 | H = 15 9 | W = 15 10 | K = 1024 11 | R = 3 12 | S = 3 13 | 14 | st = 2 15 | group = 2 16 | 17 | OG = K // group 18 | IG = nC // group 19 | 20 | P = (H - R + 1) // st + 1 21 | Q = (W - S + 1) // st + 1 22 | 23 | dtype = "float32" 24 | 25 | A = tvm.te.placeholder([N, nC, H, W], dtype=dtype, name="A") 26 | B = tvm.te.placeholder([K, nC, R, S], dtype=dtype, name="B") 27 | c = 
tvm.te.reduce_axis([0, nC], name="c") 28 | r = tvm.te.reduce_axis([0, R], name="r") 29 | s = tvm.te.reduce_axis([0, S], name="s") 30 | C = tvm.te.compute([N, K, P, Q], 31 | lambda n, k, h, w : 32 | tvm.te.sum(A[n, c, h * st + r, w * st + s] * B[k, c, r, s], axis=[c,r,s]), name="C") 33 | 34 | dC = tvm.te.placeholder([N, K, P, Q], dtype=dtype, name="dC") 35 | 36 | print(C.op.body) 37 | 38 | print(dir(C.op.body[0].source[0])) 39 | 40 | print(tvm.te.expr_equal(C.op.body[0].source[0].b.args[0], C.op.body[0].source[0].b.args[1])) 41 | 42 | dA = tvm.te.grad_op(A, C, dC) 43 | 44 | s = tvm.te.create_schedule(dA.op) 45 | 46 | print(tvm.lower(s, [A, B, dC, dA], simple_mode=True)) 47 | 48 | func = tvm.build(s, [A, B, dC, dA], target="llvm") 49 | 50 | A_np = np.random.uniform(-1, 1, [N, nC, H, W]).astype("float32") 51 | # B_np = np.ones([K, nC, R, S]).astype("float32") 52 | B_np = np.random.uniform(-1, 1, [K, nC, R, S]).astype("float32") 53 | # dC_np = np.ones([N, K, P, Q]).astype("float32") 54 | dC_np = np.random.uniform(-1, 1, [N, K, P, Q]).astype("float32") 55 | 56 | dA_np = np.zeros([N, nC, H, W]).astype("float32") 57 | 58 | ctx = tvm.device("llvm", 0) 59 | A_tvm = tvm.nd.array(A_np, ctx) 60 | B_tvm = tvm.nd.array(B_np, ctx) 61 | dC_tvm = tvm.nd.array(dC_np, ctx) 62 | dA_tvm = tvm.nd.array(dA_np, ctx) 63 | 64 | func(A_tvm, B_tvm, dC_tvm, dA_tvm) 65 | 66 | 67 | # compare the results with pytorch 68 | A_torch = torch.tensor(A_np) 69 | B_torch = torch.tensor(B_np) 70 | dC_torch = torch.tensor(dC_np) 71 | #without output_padding=1: shapes (2, 16, 14, 14), golden:(2, 16, 13, 13) mismatch 72 | golden_torch = torch.nn.functional.conv_transpose2d(dC_torch, B_torch, stride=(st, st), output_padding=0) 73 | # print("da_tvm", dA_tvm.shape) 74 | # print("golden_shape,", golden_torch.size()) 75 | 76 | # print("dA_tvm:", dA_tvm) 77 | # print("golden_torch", golden_torch) 78 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_torch.numpy(), atol=1e-3, rtol=1e-5) 79 | print("Success!\n") 80 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-conv2d-case3.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | N = 2 7 | nC = 16 8 | H = 14 9 | W = 14 10 | K = 8 11 | R = 3 12 | S = 3 13 | 14 | st = 2 15 | group = 2 16 | 17 | OG = K // group 18 | IG = nC // group 19 | 20 | P = (H - R + 1) // st 21 | Q = (W - S + 1) // st 22 | 23 | dtype = "float32" 24 | 25 | A = tvm.te.placeholder([N, nC, H, W], dtype=dtype, name="A") 26 | B = tvm.te.placeholder([K, IG, R, S], dtype=dtype, name="B") 27 | c = tvm.te.reduce_axis([0, IG], name="c") 28 | r = tvm.te.reduce_axis([0, R], name="r") 29 | s = tvm.te.reduce_axis([0, S], name="s") 30 | C = tvm.te.compute([N, K, P, Q], 31 | lambda n, k, h, w : 32 | tvm.te.sum(A[n, k // OG * IG + c, h * st + r, w * st + s] * B[k, c, r, s], axis=[c,r,s]), name="C") 33 | 34 | dC = tvm.te.placeholder([N, K, P, Q], dtype=dtype, name="dC") 35 | 36 | print(C.op.body) 37 | 38 | print(dir(C.op.body[0].source[0])) 39 | 40 | print(tvm.te.expr_equal(C.op.body[0].source[0].b.args[0], C.op.body[0].source[0].b.args[1])) 41 | 42 | dA = tvm.te.grad_op(A, C, dC) 43 | 44 | s = tvm.te.create_schedule(dA.op) 45 | 46 | print(tvm.lower(s, [A, B, dC, dA], simple_mode=True)) 47 | 48 | func = tvm.build(s, [A, B, dC, dA], target="llvm") 49 | 50 | A_np = np.random.uniform(-1, 1, [N, nC, H, W]).astype("float32") 51 | B_np = np.random.uniform(-1, 1, [K, IG, R, 
S]).astype("float32") 52 | dC_np = np.random.uniform(-1, 1, [N, K, P, Q]).astype("float32") 53 | dA_np = np.zeros([N, nC, H, W]).astype("float32") 54 | 55 | ctx = tvm.device("llvm", 0) 56 | A_tvm = tvm.nd.array(A_np, ctx) 57 | B_tvm = tvm.nd.array(B_np, ctx) 58 | dC_tvm = tvm.nd.array(dC_np, ctx) 59 | dA_tvm = tvm.nd.array(dA_np, ctx) 60 | 61 | func(A_tvm, B_tvm, dC_tvm, dA_tvm) 62 | 63 | 64 | # =======> 65 | # compare the results with pytorch 66 | A_torch = torch.tensor(A_np) 67 | B_torch = torch.tensor(B_np) 68 | dC_torch = torch.tensor(dC_np) 69 | #without output_padding=1: shapes (2, 16, 14, 14), golden:(2, 16, 13, 13) mismatch 70 | golden_torch = torch.nn.functional.conv_transpose2d(dC_torch, B_torch, stride=(st, st), output_padding=1, groups=group) 71 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_torch.numpy(), rtol=1e-3) 72 | print("Compare with PyTorch success!") 73 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-conv2d-topi-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | import topi 5 | 6 | 7 | N = 2 8 | nC = 16 9 | H = 14 10 | W = 14 11 | K = 8 12 | R = 3 13 | S = 3 14 | 15 | st = 1 16 | 17 | P = (H - R + 1) // st 18 | Q = (W - S + 1) // st 19 | 20 | dtype = "float32" 21 | 22 | A = tvm.te.placeholder([N, nC, H, W], dtype=dtype, name="A") 23 | B = tvm.te.placeholder([K, nC, R, S], dtype=dtype, name="B") 24 | c = tvm.te.reduce_axis([0, nC], name="c") 25 | r = tvm.te.reduce_axis([0, R], name="r") 26 | s = tvm.te.reduce_axis([0, S], name="s") 27 | C = topi.nn.conv2d_nchw(A, B, 1, 0, 1, out_dtype=dtype) 28 | #C = tvm.te.compute([N, K, P, Q], 29 | # lambda n, k, h, w : 30 | # tvm.te.sum(A[n, c, h * st + r, w * st + s] * B[k, c, r, s], axis=[c,r,s]), name="C") 31 | 32 | dC = tvm.te.placeholder([N, K, P, Q], dtype=dtype, name="dC") 33 | 34 | print(C.op.body) 35 | 36 | print(dir(C.op.body[0].source[0])) 37 | 38 | print(tvm.te.expr_equal(C.op.body[0].source[0].b.args[0], C.op.body[0].source[0].b.args[1])) 39 | 40 | dA, = tvm.te.mygradient(C, [A], dC) 41 | #dA = tvm.te.grad_op(A, C, dC) 42 | 43 | s = tvm.te.create_schedule(dA.op) 44 | 45 | print(tvm.lower(s, [A, B, dC, dA], simple_mode=True)) 46 | 47 | func = tvm.build(s, [A, B, dC, dA], target="llvm") 48 | 49 | A_np = np.random.uniform(-10, 10, [N, nC, H, W]).astype("float32") 50 | B_np = np.random.uniform(-10, 10, [K, nC, R, S]).astype("float32") 51 | dC_np = np.random.uniform(-10, 10, [N, K, P, Q]).astype("float32") 52 | dA_np = np.zeros([N, nC, H, W]).astype("float32") 53 | 54 | ctx = tvm.device("llvm", 0) 55 | A_tvm = tvm.nd.array(A_np, ctx) 56 | B_tvm = tvm.nd.array(B_np, ctx) 57 | dC_tvm = tvm.nd.array(dC_np, ctx) 58 | dA_tvm = tvm.nd.array(dA_np, ctx) 59 | 60 | func(A_tvm, B_tvm, dC_tvm, dA_tvm) 61 | 62 | print(dA_tvm) 63 | 64 | # =======> 65 | # compare the results with pytorch 66 | A_torch = torch.tensor(A_np) 67 | B_torch = torch.tensor(B_np) 68 | dC_torch = torch.tensor(dC_np) 69 | golden_torch = torch.nn.functional.conv_transpose2d(dC_torch, B_torch) 70 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_torch.numpy(), rtol=1e-5, atol=1e-4) 71 | print("Compare with PyTorch success!") 72 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-cross_entropy-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as 
np 3 | import torch 4 | 5 | 6 | def cross_entropy(inputs, targets, weights, reduction="mean"): 7 | N, C = inputs.shape 8 | c = tvm.te.reduce_axis([0, C], "c") 9 | sum_val = tvm.te.compute([N], lambda i: tvm.te.sum(tvm.tir.exp(inputs[i, c]), axis=[c]), "sum_val") 10 | if reduction == "mean": 11 | rn = tvm.te.reduce_axis([0, N], "rn") 12 | rc = tvm.te.reduce_axis([0, C], "rc") 13 | sum_weight = tvm.te.compute([1], lambda i: tvm.te.sum(weights[i+rc]*targets[rn, rc]/N, axis=[rn, rc]), "mean_weight", requires_grad=False) 14 | elif reduction == "sum": 15 | sum_weight = tvm.te.compute([1], lambda i: tvm.tir.expr.const(1, weights.dtype)/N, "sum_weight", requires_grad=False) 16 | else: 17 | raise NameError() 18 | rrn = tvm.te.reduce_axis([0, N], "rrn") 19 | rrc = tvm.te.reduce_axis([0, C], "rrc") 20 | # return tvm.te.compute([1], 21 | # lambda i: tvm.te.sum( 22 | # weights[rrc] * targets[i+rrn, rrc] * (tvm.tir.log(sum_val[i+rrn]) - inputs[i+rrn, rrc]*targets[rrn, rrc])/(N*sum_weight[i]), 23 | # axis=[rrn, rrc]), 24 | # "cross_entropy") 25 | return tvm.te.compute([1], 26 | lambda i: tvm.te.sum( 27 | targets[i+rrn, rrc] * (tvm.tir.log(sum_val[i+rrn]) - inputs[i+rrn, rrc]*targets[i+rrn, rrc])/(N), 28 | axis=[rrn, rrc]), 29 | "cross_entropy") 30 | 31 | 32 | N = 100 33 | C = 100 34 | dtype = "float32" 35 | ltype = "int64" 36 | target = "llvm" 37 | 38 | A = tvm.te.placeholder([N, C], dtype=dtype, name="A") 39 | targets = tvm.te.placeholder([N, C], dtype=dtype, name="targets", requires_grad=False) 40 | labels = tvm.te.placeholder([N], dtype=ltype, name="labels", requires_grad=False) 41 | weights = tvm.te.placeholder([C], dtype=dtype, name="weights", requires_grad=False) 42 | 43 | loss = cross_entropy(A, targets, weights, reduction="mean") 44 | 45 | dloss = tvm.te.placeholder([1], dtype=dtype, name="dloss") 46 | 47 | dA, = tvm.te.mygradient(loss, [A], dloss) 48 | 49 | s = tvm.te.create_schedule([loss.op, dA.op]) 50 | 51 | print(tvm.lower(s, [A, targets, weights, loss, dloss, dA], simple_mode=True)) 52 | 53 | func = tvm.build(s, [A, targets, weights, loss, dloss, dA], target) 54 | 55 | A_np = np.random.uniform(-1, 1, [N, C]).astype(dtype) 56 | dA_np = np.zeros([N, C]).astype(dtype) * 0 + 1 57 | labels_np = np.random.randint(0, C, [N]).astype(ltype) 58 | # labels_np[0] = 1 59 | targets_np = np.zeros([N, C]).astype(dtype) 60 | for i in range(N): 61 | targets_np[i][labels_np[i]] = 1.0 62 | weights_np = np.random.uniform(-1, 1, [C]).astype(dtype) * 0 + 1 63 | loss_np = np.zeros([1]).astype(dtype) 64 | dloss_np = np.random.uniform(-1, 1, [1]).astype(dtype) * 0 + 1 65 | 66 | ctx = tvm.device(target, 0) 67 | A_tvm = tvm.nd.array(A_np, ctx) 68 | dA_tvm = tvm.nd.array(dA_np, ctx) 69 | targets_tvm = tvm.nd.array(targets_np, ctx) 70 | weights_tvm = tvm.nd.array(weights_np, ctx) 71 | loss_tvm = tvm.nd.array(loss_np, ctx) 72 | dloss_tvm = tvm.nd.array(dloss_np, ctx) 73 | 74 | func(A_tvm, targets_tvm, weights_tvm, loss_tvm, dloss_tvm, dA_tvm) 75 | print(loss_tvm) 76 | print(dA_tvm) 77 | 78 | A_torch = torch.tensor(A_np, requires_grad=True) 79 | labels_torch = torch.tensor(labels_np) 80 | weights_torch = torch.tensor(weights_np) 81 | loss_torch = torch.nn.functional.cross_entropy(A_torch, labels_torch, weights_torch, reduction="mean") 82 | 83 | print(loss_torch.detach().numpy()) 84 | loss_torch.backward() 85 | print(A_torch.grad.numpy()) 86 | 87 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A_torch.grad.numpy(), rtol=1e-30, atol=1e-9) 88 | print("Compare to PyTorch success!") 
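89 | 90 | # Extra illustrative check (an addition for clarity; not part of the original test): 91 | # with one-hot targets and unit weights, the analytic gradient of the mean-reduced 92 | # cross entropy above is (softmax(A) - targets) / N, so dA can also be verified 93 | # directly against NumPy. The tolerances below are assumptions. 94 | exp_A = np.exp(A_np - A_np.max(axis=1, keepdims=True)) 95 | softmax_np = exp_A / exp_A.sum(axis=1, keepdims=True) 96 | golden_np = (softmax_np - targets_np) / N 97 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-4, atol=1e-6) 98 | print("Compare to analytic NumPy gradient success!")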
-------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-downcast-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | 4 | 5 | H = 8 6 | 7 | dtype = "float32" 8 | 9 | A = tvm.te.placeholder([H, H], dtype=dtype, name="A") 10 | C = tvm.te.compute([H], 11 | lambda i: 12 | A[i, i], name="C") 13 | 14 | dC = tvm.te.placeholder([H], dtype=dtype, name="dC") 15 | 16 | dA = tvm.te.grad_op(A, C, dC) 17 | 18 | s = tvm.te.create_schedule(dA.op) 19 | 20 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 21 | 22 | func = tvm.build(s, [A, dC, dA], target="llvm") 23 | 24 | A_np = np.random.uniform(-10, 10, [H, H]).astype("float32") 25 | dC_np = np.random.uniform(-10, 10, [H]).astype("float32") 26 | dA_np = np.zeros([H, H]).astype("float32") 27 | 28 | ctx = tvm.device("llvm", 0) 29 | A_tvm = tvm.nd.array(A_np, ctx) 30 | dC_tvm = tvm.nd.array(dC_np, ctx) 31 | dA_tvm = tvm.nd.array(dA_np, ctx) 32 | 33 | func(A_tvm, dC_tvm, dA_tvm) 34 | 35 | print(dA_tvm) 36 | 37 | # =======> 38 | # compare the results with numpy 39 | golden_np = np.diag(dC_np) 40 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-30) 41 | print("Compare with Numpy success!") 42 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-flatten.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | 4 | 5 | H = 8 6 | W = 16 7 | 8 | dtype = "float32" 9 | 10 | A = tvm.te.placeholder([H, W], dtype=dtype, name="A") 11 | C = tvm.te.compute([H * W], 12 | lambda i: 13 | A[i//W, i%W], name="C") 14 | 15 | dC = tvm.te.placeholder([H * W], dtype=dtype, name="dC") 16 | 17 | dA = tvm.te.grad_op(A, C, dC) 18 | 19 | s = tvm.te.create_schedule(dA.op) 20 | 21 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 22 | 23 | func = tvm.build(s, [A, dC, dA], target="llvm") 24 | 25 | A_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 26 | dC_np = np.random.uniform(-10, 10, [H * W]).astype("float32") 27 | dA_np = np.zeros([H, W]).astype("float32") 28 | 29 | ctx = tvm.device("llvm", 0) 30 | A_tvm = tvm.nd.array(A_np, ctx) 31 | dC_tvm = tvm.nd.array(dC_np, ctx) 32 | dA_tvm = tvm.nd.array(dA_np, ctx) 33 | 34 | func(A_tvm, dC_tvm, dA_tvm) 35 | 36 | print(dA_tvm) 37 | 38 | # =======> 39 | # compare the results with numpy 40 | golden_np = np.reshape(dC_np, (H, W)) 41 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-30) 42 | print("Compare with Numpy success!") -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-gemm.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | H = 8 7 | W = 4 8 | K = 3 9 | 10 | dtype = "float32" 11 | 12 | A = tvm.te.placeholder([H, K], dtype=dtype, name="A") 13 | B = tvm.te.placeholder([K, W], dtype=dtype, name="B") 14 | k = tvm.te.reduce_axis([0, K], name="k") 15 | C = tvm.te.compute([H, W], 16 | lambda h, w : 17 | tvm.te.sum(A[h, k] * B[k, w], axis=[k]), name="C") 18 | 19 | dC = tvm.te.placeholder([H, W], dtype=dtype, name="dC") 20 | 21 | dA = tvm.te.grad_op(A, C, dC) 22 | 23 | s = tvm.te.create_schedule(dA.op) 24 | 25 | print(tvm.lower(s, [A, B, dC, dA], simple_mode=True)) 26 | 27 | func = tvm.build(s, [A, B, dC, dA], target="llvm") 28 | 29 | A_np = 
np.random.uniform(-10, 10, [H, K]).astype("float32") 30 | B_np = np.random.uniform(-10, 10, [K, W]).astype("float32") 31 | dC_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 32 | dA_np = np.zeros([H, K]).astype("float32") 33 | 34 | ctx = tvm.device("llvm", 0) 35 | A_tvm = tvm.nd.array(A_np, ctx) 36 | B_tvm = tvm.nd.array(B_np, ctx) 37 | dC_tvm = tvm.nd.array(dC_np, ctx) 38 | dA_tvm = tvm.nd.array(dA_np, ctx) 39 | 40 | func(A_tvm, B_tvm, dC_tvm, dA_tvm) 41 | 42 | print(dA_tvm) 43 | 44 | # =======> 45 | # compare the results with numpy 46 | golden_np = np.matmul(dC_np, B_np.T) 47 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-5) 48 | print("Compare with Numpy success!") 49 | 50 | 51 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-maxpool-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | H = 4 7 | W = 4 8 | 9 | R = 2 10 | S = 2 11 | 12 | P = H // R 13 | Q = W // S 14 | 15 | dtype = "float32" 16 | 17 | 18 | def mse_loss(inputs, targets): 19 | N = inputs.shape[0] 20 | K = inputs.shape[1] 21 | n = tvm.te.reduce_axis([0, inputs.shape[0]], name="n") 22 | k = tvm.te.reduce_axis([0, inputs.shape[1]], name="k") 23 | # return tvm.te.compute([1], lambda i: tvm.te.sum((inputs[i + n, k]-targets[i + n, k])*(inputs[i + n, k]-targets[i + n, k])/(N*K), axis=[n, k]), name="mse") 24 | return tvm.te.compute([1], lambda i: tvm.te.sum(tvm.tir.power((inputs[i + n, k]-targets[i + n, k]), 2)/(N*K), axis=[n, k]), name="mse", requires_grad=True) 25 | 26 | 27 | A = tvm.te.placeholder([H, W], dtype=dtype, name="A", requires_grad=True) 28 | label = tvm.te.placeholder([P, Q], dtype=dtype, name="label", requires_grad=False) 29 | 30 | p = tvm.te.reduce_axis([0, R], "p") 31 | q = tvm.te.reduce_axis([0, S], "q") 32 | 33 | B = tvm.te.compute([P, Q], lambda a, b: tvm.te.max(A[a*R+p, b*S+q]-1e-5, axis=[p, q]), requires_grad=False, name="max_value") 34 | C = tvm.te.compute([H, W], lambda u, v: tvm.tir.if_then_else(A[u, v] > B[u//R, v//S], 1.0, 0.0), requires_grad=False, name="map") 35 | 36 | r = tvm.te.reduce_axis([0, R], "r") 37 | s = tvm.te.reduce_axis([0, S], "s") 38 | 39 | D = tvm.te.compute([P, Q], 40 | lambda i, j: tvm.te.sum(A[i*R+r, j*S+s] * C[i*R+r, j*S+s], axis=[r, s]), name="D", requires_grad=True) 41 | 42 | E = mse_loss(D, label) 43 | 44 | dA, = tvm.te.mygradient(E, [A]) 45 | 46 | s = tvm.te.create_schedule([E.op, dA.op]) 47 | 48 | print(tvm.lower(s, [A, label, E, dA], simple_mode=True)) 49 | 50 | func = tvm.build(s, [A, label, E, dA], target="llvm") 51 | 52 | A_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 53 | label_np = np.random.uniform(-10, 10, [P, Q]).astype("float32") 54 | E_np = np.zeros([1]).astype("float32") 55 | dA_np = np.zeros([H, W]).astype("float32") 56 | 57 | ctx = tvm.device("llvm", 0) 58 | A_tvm = tvm.nd.array(A_np, ctx) 59 | label_tvm = tvm.nd.array(label_np, ctx) 60 | E_tvm = tvm.nd.array(E_np, ctx) 61 | dA_tvm = tvm.nd.array(dA_np, ctx) 62 | 63 | func(A_tvm, label_tvm, E_tvm, dA_tvm) 64 | 65 | print(E_tvm) 66 | 67 | # ==> compare to pytorch 68 | 69 | A0_torch = torch.tensor(A_np, requires_grad=True) 70 | A_torch = A0_torch.unsqueeze(0).unsqueeze(1) 71 | label_torch = torch.tensor(label_np, requires_grad=False) 72 | D_torch = torch.max_pool2d(A_torch, [R, S]).squeeze() 73 | E_torch = torch.nn.functional.mse_loss(D_torch, label_torch) 74 | print(E_torch.detach().numpy()) 75 | 
E_torch.backward() 76 | 77 | tvm.testing.assert_allclose(E_tvm.asnumpy(), E_torch.detach().numpy(), atol=1e-5, rtol=1e-30) 78 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A0_torch.grad.numpy(), atol=1e-5, rtol=1e-30) -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-mse_loss-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | batch_size = 3 6 | num_classes = 5 7 | shape_size = [batch_size, num_classes] 8 | dtype = "float32" 9 | ltype = "int64" 10 | 11 | A = tvm.te.placeholder(shape_size, dtype=dtype, name="A", requires_grad=True) 12 | targets = tvm.te.placeholder(shape_size, dtype=dtype, name="targets", requires_grad=False) 13 | n = tvm.te.reduce_axis([0, A.shape[0]], name="n") 14 | k = tvm.te.reduce_axis([0, A.shape[1]], name="k") 15 | loss = tvm.te.compute([1], lambda i: tvm.te.sum( 16 | (A[i + n, k]-targets[n, k])*(A[i + n, k]-targets[n, k]), axis=[n, k]), name="mse", requires_grad=True) 17 | 18 | dloss = tvm.te.placeholder([1], dtype=dtype, name="dloss") 19 | dA, = tvm.te.mygradient(loss, [A], dloss) 20 | 21 | s = tvm.te.create_schedule([loss.op, dA.op]) 22 | 23 | print(tvm.lower(s, [A, targets, loss, dloss, dA], simple_mode=True)) 24 | 25 | func = tvm.build(s, [A, targets, loss, dloss, dA], target="llvm") 26 | 27 | A_np = np.random.uniform(-10, 10, shape_size).astype(dtype) 28 | dA_np = np.zeros([batch_size, num_classes]).astype(dtype) * 0 + 1 29 | labels_np = np.random.randint(0, num_classes, [batch_size]).astype(ltype) 30 | targets_np = np.zeros([batch_size, num_classes]).astype(dtype) 31 | for i in range(batch_size): 32 | targets_np[i][labels_np[i]] = 1.0 33 | loss_np = np.zeros([1]).astype(dtype) 34 | dloss_np = np.random.uniform(-1, 1, [1]).astype(dtype) * 0 + 1 35 | 36 | ctx = tvm.device("llvm", 0) 37 | A_tvm = tvm.nd.array(A_np, ctx) 38 | dA_tvm = tvm.nd.array(dA_np, ctx) 39 | targets_tvm = tvm.nd.array(targets_np, ctx) 40 | loss_tvm = tvm.nd.array(loss_np, ctx) 41 | dloss_tvm = tvm.nd.array(dloss_np, ctx) 42 | 43 | func(A_tvm, targets_tvm, loss_tvm, dloss_tvm, dA_tvm) 44 | 45 | print("loss_tvm", loss_tvm) 46 | print("dA_tvm", dA_tvm) 47 | 48 | # =======> 49 | # compare the results with pytorch 50 | A_torch = torch.tensor(A_np, requires_grad=True) 51 | targets_torch = torch.tensor(targets_np) 52 | loss_torch = torch.nn.functional.mse_loss(A_torch, targets_torch, reduction="sum") 53 | print("loss_pytorch", loss_torch.detach().numpy()) 54 | loss_torch.backward() 55 | print("dA_pytorch", A_torch.grad.numpy()) 56 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A_torch.grad.numpy(), rtol=1e-30, atol=1e-30) 57 | print("Compare to PyTorch success!") 58 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-padding-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | N = 2 7 | nC = 16 8 | H = 14 9 | W = 14 10 | K = 16 11 | R = 3 12 | S = 3 13 | 14 | padding = 1 15 | 16 | P = H + 2 * padding 17 | Q = W + 2 * padding 18 | 19 | dtype = "float32" 20 | 21 | A = tvm.te.placeholder([N, nC, H, W], dtype=dtype, name="A") 22 | C = tvm.te.compute([N, K, P, Q], 23 | lambda n, k, h, w : 24 | tvm.tir.if_then_else( 25 | tvm.tir.all(h >= padding, h < P-padding, w >= padding, w < Q-padding), 26 | A[n, k, h-padding, w-padding], 0.0), 27 | name="C") 28 | 29 | dC = 
tvm.te.placeholder([N, K, P, Q], dtype=dtype, name="dC") 30 | 31 | print(C.op.body[0].name) 32 | 33 | print(type(C.op.body[0].args[1])) 34 | 35 | dA = tvm.te.grad_op(A, C, dC) 36 | 37 | s = tvm.te.create_schedule(dA.op) 38 | 39 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 40 | 41 | func = tvm.build(s, [A, dC, dA], target="llvm") 42 | 43 | A_np = np.random.uniform(-10, 10, [N, nC, H, W]).astype("float32") 44 | dC_np = np.random.uniform(-10, 10, [N, K, P, Q]).astype("float32") 45 | dA_np = np.zeros([N, nC, H, W]).astype("float32") 46 | 47 | ctx = tvm.device("llvm", 0) 48 | A_tvm = tvm.nd.array(A_np, ctx) 49 | dC_tvm = tvm.nd.array(dC_np, ctx) 50 | dA_tvm = tvm.nd.array(dA_np, ctx) 51 | 52 | func(A_tvm, dC_tvm, dA_tvm) 53 | 54 | print(dA_tvm) 55 | 56 | # =======> 57 | # compare the results with numpy 58 | golden_np = dC_np[:,:, padding:P-padding, padding:Q-padding] 59 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), golden_np, rtol=1e-30) 60 | print("Compare with Numpy success!") -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-power-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | 4 | 5 | H = 4 6 | W = 2 7 | 8 | dtype = "float32" 9 | 10 | A = tvm.te.placeholder([H, W], dtype=dtype, name="A") 11 | 12 | C = tvm.te.compute([H, W], 13 | lambda h, w : 14 | tvm.tir.power(A[h, w]+1e-9, 2), name="C") 15 | 16 | dC = tvm.te.placeholder([H, W], dtype=dtype, name="dC") 17 | 18 | dA = tvm.te.grad_op(A, C, dC) 19 | 20 | # schedule C.op together with dA.op so a single function computes both 21 | 22 | s = tvm.te.create_schedule([C.op, dA.op]) 23 | 24 | print(tvm.lower(s, [A, C, dC, dA], simple_mode=True)) 25 | 26 | func = tvm.build(s, [A, C, dC, dA], target="llvm") 27 | 28 | A_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 29 | dC_np = np.random.uniform(-10, 10, [H, W]).astype("float32") 30 | C_np = np.zeros([H, W]).astype("float32") 31 | dA_np = np.zeros([H, W]).astype("float32") 32 | 33 | ctx = tvm.device("llvm", 0) 34 | A_tvm = tvm.nd.array(A_np, ctx) 35 | C_tvm = tvm.nd.array(C_np, ctx) 36 | dC_tvm = tvm.nd.array(dC_np, ctx) 37 | dA_tvm = tvm.nd.array(dA_np, ctx) 38 | 39 | func(A_tvm, C_tvm, dC_tvm, dA_tvm) 40 | 41 | print(A_tvm) 42 | print(C_tvm) 43 | print(dC_tvm) 44 | print(dA_tvm) 45 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-repeat-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | 4 | H = 8 5 | 6 | dtype = "float32" 7 | 8 | A = tvm.te.placeholder([H, H], dtype=dtype, name="A") 9 | k = tvm.te.reduce_axis([0, H], name="k") 10 | C = tvm.te.compute([H, H], 11 | lambda h, w : 12 | tvm.te.sum(A[h, k] * A[k, w], axis=[k]), name="C") 13 | 14 | dC = tvm.te.compute([H, H], lambda h, w: 1.0, name="dC") 15 | 16 | dA = tvm.te.grad_op(A, C, dC) 17 | 18 | s = tvm.te.create_schedule(dA.op) 19 | 20 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 21 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-softmax-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | N = 2 7 | H = 14 8 | 9 | dtype = "float32" 10 | 11 | factor = 1 12 | 13 | 14 | def softmax(inputs): 15 | K = inputs.shape[-1] 16 | k = tvm.te.reduce_axis([0, K], name="k") 17 | k1 = tvm.te.reduce_axis([0, K], 
name="k1") 18 | max_val = tvm.te.compute([N, K], lambda n, h: tvm.te.max(inputs[n, k1], axis=[k1]), name="mean_val", requires_grad=True) 19 | exp_val = tvm.te.compute(inputs.shape, lambda n, h: tvm.tir.exp(inputs[n, h]-max_val[n, h]), name="Softmax_exp", requires_grad=True) 20 | sum_val = tvm.te.compute(exp_val.shape, lambda n, h: tvm.te.sum(exp_val[n, k], axis=[k]), name="Softmax_sum", requires_grad=True) 21 | final_val = tvm.te.compute(exp_val.shape, lambda n, h: exp_val[n, h]/(sum_val[n, h]), name="Softmax_div", requires_grad=True) 22 | return [exp_val, sum_val, final_val] 23 | 24 | 25 | def mse_loss(inputs, targets): 26 | N = inputs.shape[0] 27 | K = inputs.shape[1] 28 | n = tvm.te.reduce_axis([0, inputs.shape[0]], name="n") 29 | k = tvm.te.reduce_axis([0, inputs.shape[1]], name="k") 30 | # return tvm.te.compute([1], lambda i: tvm.te.sum((inputs[i + n, k]-targets[i + n, k])*(inputs[i + n, k]-targets[i + n, k])/(N*K), axis=[n, k]), name="mse") 31 | return tvm.te.compute([1], lambda i: tvm.te.sum(tvm.tir.power((inputs[i + n, k]-targets[i + n, k]), 2)/(N*K), axis=[n, k]), name="mse", requires_grad=True) 32 | 33 | 34 | A = tvm.te.placeholder([N, H], dtype=dtype, name="A", requires_grad=True) 35 | label = tvm.te.placeholder([N, H], dtype=dtype, name="label", requires_grad=False) 36 | B, C, D = softmax(A) 37 | E = mse_loss(D, label) 38 | print(E.requires_grad) 39 | 40 | dD = tvm.te.placeholder([N, H], dtype=dtype, name="dD") 41 | dE = tvm.te.placeholder([1], dtype=dtype, name="dE") 42 | 43 | dA, = tvm.te.mygradient(E, [A]) 44 | 45 | s = tvm.te.create_schedule([E.op, dA.op]) 46 | 47 | print(tvm.lower(s, [A, label, D, E, dA], simple_mode=True)) 48 | 49 | func = tvm.build(s, [A, label, D, E, dA], target="llvm") 50 | 51 | A_np = np.random.uniform(-100, 100, [N, H]).astype("float32") 52 | label_np = np.random.uniform(-1, 1, [N, H]).astype("float32") 53 | D_np = np.zeros([N, H]).astype("float32") 54 | E_np = np.zeros([1]).astype("float32") 55 | 56 | dA_np = np.zeros([N, H]).astype("float32") 57 | 58 | ctx = tvm.device("llvm", 0) 59 | A_tvm = tvm.nd.array(A_np, ctx) 60 | label_tvm = tvm.nd.array(label_np, ctx) 61 | D_tvm = tvm.nd.array(D_np, ctx) 62 | dA_tvm = tvm.nd.array(dA_np, ctx) 63 | E_tvm = tvm.nd.array(E_np, ctx) 64 | 65 | func(A_tvm, label_tvm, D_tvm, E_tvm, dA_tvm) 66 | 67 | # print("TVM result:\n", D_tvm) 68 | print("TVM gradient:\n", dA_tvm) 69 | 70 | # =======> 71 | # compare the results with pytorch 72 | A_torch = torch.tensor(A_np, requires_grad=True) 73 | label_torch = torch.tensor(label_np, requires_grad=False) 74 | E_torch = torch.tensor(E_np) 75 | 76 | D_torch = torch.nn.functional.softmax(A_torch, dim=1) 77 | E_torch = torch.nn.functional.mse_loss(D_torch, label_torch) 78 | # print("Pytorch result:\n", D_torch.detach().numpy()) 79 | E_torch.backward() 80 | print("Pytorch gradient:\n", A_torch.grad.numpy()) 81 | tvm.testing.assert_allclose(D_tvm.asnumpy(), D_torch.detach().numpy(), atol=1e-6*factor, rtol=1e-5) 82 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A_torch.grad.numpy(), atol=1e-6*factor, rtol=1e-5) 83 | print("Compare with PyTorch success!") 84 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-sub-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | 4 | H = 8 5 | W = 9 6 | 7 | dtype = "float32" 8 | 9 | A = tvm.te.placeholder([H, W], dtype=dtype, name="A") 10 | 11 | C = tvm.te.compute([H, W], 12 | lambda h, w : 13 | A[h, w] * 4 - A[h, w] 
* A[h, w], name="C") 14 | 15 | dC = tvm.te.compute([H, W], lambda h, w: 1.0, name="dC") 16 | 17 | dA = tvm.te.grad_op(A, C, dC) 18 | 19 | s = tvm.te.create_schedule(dA.op) 20 | 21 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 22 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/te-sub-case2.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 3 | 4 | H = 8 5 | W = 9 6 | 7 | dtype = "float32" 8 | 9 | A = tvm.te.placeholder([H], dtype=dtype, name="A") 10 | 11 | C = tvm.te.compute([H, W], 12 | lambda h, w : 13 | A[h] * 4 - A[h] * A[h], name="C") 14 | 15 | dC = tvm.te.compute([H, W], lambda h, w: 1.0, name="dC") 16 | 17 | dA = tvm.te.grad_op(A, C, dC) 18 | 19 | s = tvm.te.create_schedule(dA.op) 20 | 21 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 22 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/test_report.md: -------------------------------------------------------------------------------- 1 | ## Tested Ops 2 | 3 | | op name | case No. | grad to | configs | gradient | build | correctness | 4 | | --- | --- | --- | --- | --- | --- | --- | 5 | | GEMM | 1 | A | | yes | yes | rtol=1e-5 | 6 | | Conv2d | 1 | A | st=1, pad=0, group=1, dilation=1 | yes | yes | rtol=1e-3 | 7 | | Conv2d in topi | 1 | A | st=1, pad=0, group=1, dilation=1 | yes | yes | rtol=1e-5, atol=1e-4 | 8 | | Conv2d | 2 | A | st=2, pad=0, group=1, dilation=1 | yes | yes | rtol=1e-3 | 9 | | Conv2d | 3 | A | st=2, pad=0, group=2, dilation=1 | yes | yes | rtol=1e-3 | 10 | | Flatten | 1 | A | | yes | yes | rtol<1e-30 | 11 | | Downcast | 1 | A | | yes | yes | rtol<1e-30 | 12 | | Broadcast | 1 | A | | yes | yes | rtol=1e-6 | 13 | | Padding | 1 | A | | yes | yes | rtol<1e-30 | 14 | | AvgPool | 1 | A | | yes | yes | rtol<1e-30 | 15 | | Softmax | 1 | A | | yes | yes | atol=1e-6, rtol=1e-5 | 16 | | Maxpool | 1 | A | | yes | yes | atol=1e-5, rtol=1e-30 | 17 | | Tanh | 1 | A | | yes | yes | atol=1e-6, rtol=1e-7 | 18 | | ReLU | 1 | A | | yes | yes | atol<1e-30, rtol<1e-30 | 19 | | Mse_loss | 1 | A | | yes | yes | rtol<1e-30, atol<1e-30 | 20 | | Cross_entropy | 1 | A | | yes | yes | rtol=1e-30, atol=1e-9 | 21 | | Concatenate | 1 | A,B | | yes | yes | rtol<1e-30, atol<1e-30 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/tir-relu-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | dim1 = 8 7 | dim2 = 4 8 | shape_size = [dim1, dim2] 9 | dtype = "float32" 10 | 11 | A = tvm.te.placeholder(shape_size, dtype=dtype, name="A", requires_grad=True) 12 | zeros = tvm.tir.expr.const(0, dtype) 13 | func = lambda *args: tvm.tir.if_then_else(A[args] > zeros, A[args], zeros) 14 | C = tvm.te.compute(A.shape, func, "ReLU", requires_grad=True) 15 | 16 | dC = tvm.te.placeholder(A.shape, dtype=dtype, name="dC") 17 | dA, = tvm.te.mygradient(C, [A], dC) 18 | #dA = tvm.te.grad_op(A, C, dC) 19 | 20 | s = tvm.te.create_schedule(dA.op) 21 | 22 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 23 | 24 | func = tvm.build(s, [A, dC, dA], target="llvm") 25 | 26 | A_np = np.random.uniform(-10, 10, shape_size).astype("float32") 27 | #elements are all 1 28 | dC_np = np.ones(shape_size).astype("float32") 29 | dA_np = np.zeros(shape_size).astype("float32") 30 | 31 | ctx = tvm.device("llvm", 0) 32 | A_tvm 
= tvm.nd.array(A_np, ctx) 33 | dC_tvm = tvm.nd.array(dC_np, ctx) 34 | dA_tvm = tvm.nd.array(dA_np, ctx) 35 | 36 | func(A_tvm, dC_tvm, dA_tvm) 37 | 38 | print("dA_tvm", dA_tvm) 39 | 40 | # =======> 41 | # compare the results with pytorch 42 | A_torch = torch.tensor(A_np, requires_grad=True) 43 | C_torch = torch.nn.ReLU()(A_torch) 44 | loss = C_torch.sum() 45 | loss.backward() 46 | print("Pytorch gradient:\n", A_torch.grad.numpy()) 47 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A_torch.grad.numpy(), atol=1e-30, rtol=1e-30) 48 | print("Compare with PyTorch success!") 49 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/grad/tir-tanh-case1.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | import torch 4 | 5 | 6 | dim1 = 8 7 | dim2 = 4 8 | shape_size = [dim1, dim2] 9 | dtype = "float32" 10 | 11 | A = tvm.te.placeholder(shape_size, dtype=dtype, name="A", requires_grad=True) 12 | C = tvm.te.compute(A.shape, lambda *args: tvm.tir.tanh(A[args]), "tanh", requires_grad=True) 13 | 14 | dC = tvm.te.placeholder(A.shape, dtype=dtype, name="dC") 15 | dA, = tvm.te.mygradient(C, [A], dC) 16 | #dA = tvm.te.grad_op(A, C, dC) 17 | 18 | s = tvm.te.create_schedule(dA.op) 19 | 20 | print(tvm.lower(s, [A, dC, dA], simple_mode=True)) 21 | 22 | func = tvm.build(s, [A, dC, dA], target="llvm") 23 | 24 | A_np = np.random.uniform(-10, 10, shape_size).astype("float32") 25 | # elements are all 1 26 | dC_np = np.ones(shape_size).astype("float32") 27 | dA_np = np.zeros(shape_size).astype("float32") 28 | 29 | ctx = tvm.device("llvm", 0) 30 | A_tvm = tvm.nd.array(A_np, ctx) 31 | dC_tvm = tvm.nd.array(dC_np, ctx) 32 | dA_tvm = tvm.nd.array(dA_np, ctx) 33 | 34 | func(A_tvm, dC_tvm, dA_tvm) 35 | 36 | print("dA_tvm", dA_tvm) 37 | 38 | # =======> 39 | # compare the results with pytorch 40 | A_torch = torch.tensor(A_np, requires_grad=True) 41 | C_torch = torch.tanh(A_torch) 42 | loss = C_torch.sum() 43 | loss.backward() 44 | print("Pytorch gradient:\n", A_torch.grad.numpy()) 45 | tvm.testing.assert_allclose(dA_tvm.asnumpy(), A_torch.grad.numpy(), atol=1e-6, rtol=1e-7) 46 | print("Compare with PyTorch success!") 47 | 48 | 49 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/train/get_lm_data.sh: -------------------------------------------------------------------------------- 1 | echo "=== Acquiring datasets ===" 2 | echo "---" 3 | mkdir -p save 4 | 5 | mkdir -p data 6 | cd data 7 | 8 | echo "- Downloading WikiText-2 (WT2)" 9 | wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 10 | unzip -q wikitext-2-v1.zip 11 | cd wikitext-2 12 | mv wiki.train.tokens train.txt 13 | mv wiki.valid.tokens valid.txt 14 | mv wiki.test.tokens test.txt 15 | cd .. 16 | 17 | echo "- Downloading WikiText-103 (WT103)" 18 | wget --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip 19 | unzip -q wikitext-103-v1.zip 20 | cd wikitext-103 21 | mv wiki.train.tokens train.txt 22 | mv wiki.valid.tokens valid.txt 23 | mv wiki.test.tokens test.txt 24 | cd .. 25 | 26 | echo "- Downloading enwik8 (Character)" 27 | mkdir -p enwik8 28 | cd enwik8 29 | wget --continue http://mattmahoney.net/dc/enwik8.zip 30 | wget https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py 31 | python prep_enwik8.py 32 | cd ..
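# Note: both PTB variants below (word-level "penn" and character-level
# "pennchar") are extracted from the same simple-examples.tgz archive,
# which is removed at the end of this script.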
33 | 34 | echo "- Downloading Penn Treebank (PTB)" 35 | wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 36 | tar -xzf simple-examples.tgz 37 | 38 | mkdir -p penn 39 | cd penn 40 | mv ../simple-examples/data/ptb.train.txt train.txt 41 | mv ../simple-examples/data/ptb.test.txt test.txt 42 | mv ../simple-examples/data/ptb.valid.txt valid.txt 43 | cd .. 44 | 45 | echo "- Downloading Penn Treebank (Character)" 46 | mkdir -p pennchar 47 | cd pennchar 48 | mv ../simple-examples/data/ptb.char.train.txt train.txt 49 | mv ../simple-examples/data/ptb.char.test.txt test.txt 50 | mv ../simple-examples/data/ptb.char.valid.txt valid.txt 51 | cd .. 52 | 53 | rm -rf simple-examples/ 54 | 55 | echo "---" 56 | echo "Happy language modeling :)" -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/train/mi_lstm_pytorch.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class MultiplicativeIntegration(nn.Module): 8 | def __init__(self, 9 | inputs_sizes: List[int], 10 | output_sizes: List[int], 11 | bias: bool, 12 | bias_start: float = 0.0, 13 | alpha_start: float = 1.0, 14 | beta_start: float = 1.0): 15 | super().__init__() 16 | self.inputs_sizes = inputs_sizes 17 | self.output_sizes = output_sizes 18 | total_output_size = sum(output_sizes) 19 | total_input_size = sum(inputs_sizes) 20 | self.bias_start = bias_start 21 | self.alpha_start = alpha_start 22 | self.beta_start = beta_start 23 | self.weights = nn.Parameter(torch.empty(total_input_size, total_output_size)) 24 | self.alphas = nn.Parameter(torch.empty([total_output_size])) 25 | self.betas = nn.Parameter(torch.empty([2*total_output_size])) 26 | self.biases = nn.Parameter(torch.empty([total_output_size])) if bias else None 27 | self.reset_parameters() 28 | 29 | def forward(self, input0, input1): 30 | # input0.shape = (seq_len x batch_size x input_size), input1.shape = (seq_len x batch_size x num_units) 31 | # w1.shape = (input_size x 4 * num_units), w2.shape = (num_units x 4 * num_units) 32 | w1, w2 = torch.split(self.weights, self.inputs_sizes, dim=0) 33 | # b1.shape, b2.shape = (4 * num_units) 34 | b1, b2 = torch.split(self.betas, sum(self.output_sizes), dim=0) 35 | # wx1.shape = (seq_len x batch_size x 4 * num_units), wx2.shape = (seq_len x batch_size x 4 * num_units) 36 | wx1, wx2 = input0 @ w1, input1 @ w2 37 | # res.shape = (seq_len x batch_size x 4 * num_units) 38 | res = self.alphas * wx1 * wx2 + b1 * wx1 + b2 * wx2 39 | if self.biases is not None: res += self.biases 40 | return res 41 | 42 | def reset_parameters(self): 43 | nn.init.xavier_uniform_(self.weights, gain=1.0) 44 | nn.init.constant_(self.alphas, self.alpha_start) 45 | nn.init.constant_(self.betas, self.beta_start) 46 | if self.biases is not None: 47 | nn.init.constant_(self.biases, self.bias_start) 48 | 49 | 50 | class MILSTMCell(nn.Module): 51 | def __init__(self, input_size, num_units, forget_bias=0.0, 52 | bias_start=0.0, alpha_start=1.0, 53 | beta_start=1.0, activation=torch.tanh): 54 | super().__init__() 55 | self._input_size = input_size 56 | self._num_units = num_units 57 | self._forget_bias = forget_bias 58 | self._bias_start = bias_start 59 | self._alpha_start = alpha_start 60 | self._beta_start = beta_start 61 | self._activation = activation 62 | self.mi_module = MultiplicativeIntegration( 63 | inputs_sizes=[input_size, num_units], 64 | output_sizes=[num_units, 
num_units, num_units, num_units], 65 | bias=True, 66 | bias_start=bias_start, 67 | alpha_start=alpha_start, 68 | beta_start=beta_start, 69 | ) 70 | 71 | def forward(self, inputs, state): 72 | # c/h.shape = (seq_len x batch_size x num_units) 73 | c, h = state 74 | # i = input_gate, j = new_input, f = forget_gate, o = output_gate 75 | concat = self.mi_module(inputs, h) 76 | # i/j/f/o.shape = (seq_len x batch_size x num_units) 77 | i, j, f, o = torch.split(concat, self._num_units, dim=2) 78 | # new_c.shape = (seq_len x batch_size x num_units); standard LSTM cell update: f gates the old cell, i gates the candidate 79 | new_c = c * torch.sigmoid(f + self._forget_bias) + torch.sigmoid(i) * self._activation(j) 80 | # new_h.shape = (seq_len x batch_size x num_units) 81 | new_h = self._activation(new_c) * torch.sigmoid(o) 82 | new_state = new_c, new_h 83 | return new_h, new_state 84 | -------------------------------------------------------------------------------- /flextensor/test/test_tvm_expr/train/scrnn_pytorch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class SCRNCell(nn.Module): 6 | def __init__(self, input_size, num_units, context_units, alpha): 7 | super().__init__() 8 | self._input_size = input_size 9 | self._num_units = num_units 10 | self._context_units = context_units 11 | self._alpha = alpha 12 | self.B = nn.Parameter(torch.empty(input_size, context_units)) 13 | self.V = nn.Parameter(torch.empty(context_units, num_units)) 14 | self.U = nn.Parameter(torch.empty(num_units, num_units)) 15 | self.fc = nn.Linear(context_units + input_size + num_units, num_units, bias=False) 16 | self.reset_parameters() # weight initialization: glorot uniform 17 | 18 | # NOTE: rnn_cell_impl._linear: https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py#L127 19 | def forward(self, inputs, state): 20 | # state_h.shape = (seq_len x batch_size x num_units), state_c.shape = (seq_len x batch_size x context_units) 21 | state_h, state_c = state.split([self._num_units, self._context_units], dim=2) 22 | # context_state.shape = (seq_len x batch_size x context_units) 23 | context_state = (1 - self._alpha) * (inputs @ self.B) + self._alpha * state_c 24 | # hidden_state.shape = (seq_len x batch_size x num_units) 25 | state_h = state_h.expand(inputs.shape[0], -1, -1) 26 | hidden_state = torch.sigmoid(self.fc(torch.cat([context_state, inputs, state_h], dim=2))) 27 | # output.shape = (seq_len x batch_size x num_units) 28 | output = hidden_state @ self.U + context_state @ self.V 29 | # new_state.shape = (seq_len x batch_size x (num_units+context_units)) 30 | new_state = torch.cat([hidden_state, context_state], dim=2) 31 | return output, new_state 32 | 33 | def reset_parameters(self): 34 | for weight in self.parameters(): 35 | nn.init.xavier_uniform_(weight, gain=1.0) 36 | -------------------------------------------------------------------------------- /flextensor/testing/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /flextensor/testing/array_mul.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import numpy as np 3 | from flextensor.utils import to_tuple 4 | 5 | 6 | def array_mul(N): 7 | A = tvm.te.placeholder((N,), dtype="float32") 8 | B = tvm.te.placeholder((N,), dtype="float32") 9 | C = tvm.te.compute((N,), lambda i: A[i] * B[i]) 10 | return [C.op], [A, B, C] 11 | 
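# test_array_mul below builds the elementwise kernel for every N in
# [1, extent], times each with time_evaluator (or a host-side loop), and
# reports each cost normalized to the N=1 case, i.e. how runtime scales
# with the array extent.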
12 | 13 | 14 | def test_array_mul(extent=1024, target="llvm", dev_id=0, number=10, verbose=False): 15 | time_cost_lst = [] 16 | for N in range(1, extent+1): 17 | ctx = tvm.device(target, dev_id) 18 | ary_ops, ary_bufs = array_mul(N) 19 | ary_inputs = [tvm.nd.array(np.random.uniform(size=to_tuple(buf.shape)).astype(buf.dtype), ctx) for buf in ary_bufs[:-1]] 20 | ary_inputs += [tvm.nd.array(np.zeros(shape=to_tuple(buf.shape), dtype=buf.dtype), ctx) for buf in ary_bufs[-1:]] 21 | 22 | s = tvm.te.create_schedule(ary_ops) 23 | func = tvm.build(s, ary_bufs, target) 24 | evaluator = func.time_evaluator(func.entry_name, ctx, number=number) 25 | 26 | cost = evaluator(*ary_inputs).mean * 1e3 27 | # print("N=", N, "cost=", "%f(ms)"%cost, "(target=%s, dev_id=%d, number=%d)"%(target, dev_id, number)) 28 | time_cost_lst.append(cost) 29 | 30 | res_lst = [x / time_cost_lst[0] for x in time_cost_lst] 31 | print("array_mul |(target=%s, dev_id=%d, number=%d)"%(target, dev_id, number)) 32 | if verbose: 33 | for i, res in enumerate(res_lst): 34 | print("time_cost: ext=%d / ext=1 = %f"%(i + 1, res)) 35 | else: 36 | print("time_cost: ext=%d / ext=1 = %f"%(extent, res_lst[-1])) 37 | 38 | 39 | if __name__ == "__main__": 40 | test_array_mul(extent=1024, number=1000, verbose=True) -------------------------------------------------------------------------------- /flextensor/testing/net/pytorch-overfeat.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import time 4 | 5 | 6 | class ConvBlock(nn.Module): 7 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False): 8 | super(ConvBlock, self).__init__() 9 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, 10 | padding=padding, dilation=dilation, groups=groups, bias=bias) 11 | 12 | def forward(self, inputs): 13 | ret = self.conv(inputs) 14 | return ret 15 | 16 | 17 | class Flatten(nn.Module): 18 | def __init__(self): 19 | super(Flatten, self).__init__() 20 | 21 | def forward(self, inputs): 22 | return torch.flatten(inputs) 23 | 24 | 25 | class OverFeat(nn.Module): 26 | def __init__(self, image_channel=3, num_classes=1470): 27 | super(OverFeat, self).__init__() 28 | self.net = nn.Sequential( 29 | ConvBlock(image_channel, 96, 11, 4, 5), 30 | nn.MaxPool2d(2, 2), 31 | ConvBlock(96, 256, 5, 1, 2), 32 | nn.MaxPool2d(2, 2), 33 | ConvBlock(256, 512, 3, 1, 1), 34 | ConvBlock(512, 1024, 3, 1, 1), 35 | ConvBlock(1024, 1024, 3, 1, 1), 36 | nn.MaxPool2d(2, 2), 37 | Flatten(), 38 | nn.Linear(1024 * 6 * 6, 3072), 39 | nn.Linear(3072, 4096), 40 | nn.Linear(4096, num_classes) 41 | ) 42 | 43 | def forward(self, inputs): 44 | return self.net(inputs) 45 | 46 | 47 | if __name__ == "__main__": 48 | net = OverFeat(3, 1000) 49 | net.cuda("cuda:0") 50 | batch_size = 1 51 | inputs = torch.randn([batch_size, 3, 192, 192]).cuda("cuda:0") 52 | output = net(inputs) 53 | 54 | torch.cuda.synchronize() 55 | beg = time.time() 56 | device_time = 0.0 57 | for i in range(50): 58 | start = torch.cuda.Event(enable_timing=True) 59 | finish = torch.cuda.Event(enable_timing=True) 60 | start.record() 61 | net(inputs) 62 | finish.record() 63 | torch.cuda.synchronize() 64 | device_time += start.elapsed_time(finish) 65 | end = time.time() 66 | print("Host time pass {}ms".format((end - beg) * 1e3 / 50)) 67 | print("Device time pass {}ms".format(device_time / 50)) -------------------------------------------------------------------------------- 
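Note on the timing loop used by both network benchmarks in this directory (the OverFeat script above and the YOLO script below): host time from time.time() includes Python launch overhead, while torch.cuda.Event records timestamps on the CUDA stream itself, so the two numbers can differ noticeably. A minimal, self-contained sketch of the same pattern — the names time_cuda, net, and inputs are illustrative, not code from this repository:

import torch

def time_cuda(net, inputs, iters=50):
    torch.cuda.synchronize()          # drain pending GPU work before timing
    total_ms = 0.0
    for _ in range(iters):
        start = torch.cuda.Event(enable_timing=True)
        finish = torch.cuda.Event(enable_timing=True)
        start.record()                # timestamp enqueued on the current stream
        net(inputs)
        finish.record()
        torch.cuda.synchronize()      # elapsed_time is only valid after sync
        total_ms += start.elapsed_time(finish)  # milliseconds
    return total_ms / iters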
/flextensor/testing/net/pytorch-yolo-v1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import time 4 | 5 | 6 | class ConvBlock(nn.Module): 7 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False): 8 | super(ConvBlock, self).__init__() 9 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, 10 | padding=padding, dilation=dilation, groups=groups, bias=bias) 11 | self.act = nn.ReLU() 12 | 13 | def forward(self, inputs): 14 | ret = self.conv(inputs) 15 | ret = self.act(ret) 16 | return ret 17 | 18 | 19 | class Flatten(nn.Module): 20 | def __init__(self): 21 | super(Flatten, self).__init__() 22 | 23 | def forward(self, inputs): 24 | return torch.flatten(inputs) 25 | 26 | 27 | class YOLO(nn.Module): 28 | def __init__(self, image_channel=3, num_classes=1470): 29 | super(YOLO, self).__init__() 30 | self.net = nn.Sequential( 31 | ConvBlock(image_channel, 64, 7, 2, 3), 32 | nn.MaxPool2d(2, 2), 33 | ConvBlock(64, 192, 3, 1, 1), 34 | nn.MaxPool2d(2, 2), 35 | ConvBlock(192, 128, 1, 1, 0), 36 | ConvBlock(128, 256, 3, 1, 1), 37 | ConvBlock(256, 256, 1, 1, 0), 38 | ConvBlock(256, 512, 3, 1, 1), 39 | nn.MaxPool2d(2, 2), 40 | ConvBlock(512, 256, 1, 1, 0), 41 | ConvBlock(256, 512, 3, 1, 1), 42 | ConvBlock(512, 256, 1, 1, 0), 43 | ConvBlock(256, 512, 3, 1, 1), 44 | ConvBlock(512, 256, 1, 1, 0), 45 | ConvBlock(256, 512, 3, 1, 1), 46 | ConvBlock(512, 256, 1, 1, 0), 47 | ConvBlock(256, 512, 3, 1, 1), 48 | ConvBlock(512, 512, 1, 1, 0), 49 | ConvBlock(512, 1024, 3, 1, 1), 50 | nn.MaxPool2d(2, 2), 51 | ConvBlock(1024, 512, 1, 1, 0), 52 | ConvBlock(512, 1024, 3, 1, 1), 53 | ConvBlock(1024, 512, 1, 1, 0), 54 | ConvBlock(512, 1024, 3, 1, 1), 55 | ConvBlock(1024, 1024, 3, 1, 1), 56 | ConvBlock(1024, 1024, 3, 2, 1), 57 | ConvBlock(1024, 1024, 3, 1, 1), 58 | ConvBlock(1024, 1024, 3, 1, 1), 59 | Flatten(), 60 | nn.Linear(1024 * 7 * 7, 4096), 61 | nn.ReLU(), 62 | nn.Linear(4096, num_classes) 63 | ) 64 | 65 | def forward(self, inputs): 66 | return self.net(inputs) 67 | 68 | 69 | if __name__ == "__main__": 70 | net = YOLO(3, 1470) 71 | net.cuda("cuda:0") 72 | batch_size = 1 73 | inputs = torch.randn([batch_size, 3, 448, 448]).cuda("cuda:0") 74 | output = net(inputs) 75 | 76 | torch.cuda.synchronize() 77 | beg = time.time() 78 | device_time = 0.0 79 | for i in range(50): 80 | start = torch.cuda.Event(enable_timing=True) 81 | finish = torch.cuda.Event(enable_timing=True) 82 | start.record() 83 | net(inputs) 84 | finish.record() 85 | torch.cuda.synchronize() 86 | device_time += start.elapsed_time(finish) 87 | end = time.time() 88 | print("Host time pass {}ms".format((end - beg) * 1e3 / 50)) 89 | print("Device time pass {}ms".format(device_time / 50)) -------------------------------------------------------------------------------- /flextensor/testing/others/assemble.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This is used to assemble results from experiments 3 | ''' 4 | import os 5 | 6 | 7 | if __name__ == "__main__": 8 | dir_names = os.listdir(".") 9 | dir_names = list(filter(lambda x: os.path.isdir(x), dir_names)) 10 | print(dir_names) 11 | for name in dir_names: 12 | print(name.split("conv")) 13 | write_lines = [] 14 | dir_names = sorted(dir_names, key=lambda x: int(x.split("conv")[1])) 15 | for dir_name in dir_names: 16 | dir_path = os.path.join(".", dir_name) 17 | file_names = os.listdir(dir_path) 18 | 
for file_name in file_names: 19 | if "config" in file_name: 20 | file_path = os.path.join(dir_path, file_name) 21 | with open(file_path, "r") as fin: 22 | lines = fin.readlines() 23 | if lines: 24 | line = lines[-1] 25 | write_lines.append(line) 26 | with open("configs.txt", "w") as fout: 27 | for line in write_lines: 28 | fout.write(line) -------------------------------------------------------------------------------- /flextensor/testing/others/compare_conv_cpu.py: -------------------------------------------------------------------------------- 1 | import time 2 | import tvm 3 | import torch 4 | import numpy as np 5 | from flextensor.configs.conv2d_config import all_conv_shapes 6 | from flextensor.utils import to_tuple 7 | from flextensor.nn import conv2d_nchw 8 | 9 | 10 | def evaluate(s, bufs, target, dev_id, number=10): 11 | ctx = tvm.device(target, dev_id) 12 | tvm_arys = [] 13 | for arg in bufs: 14 | shape = to_tuple(arg.shape) 15 | tmp = np.random.uniform(-10, 10, size=shape).astype(arg.dtype) 16 | tmp = tvm.nd.array(tmp, ctx) 17 | tvm_arys.append(tmp) 18 | func, evaluator = None, None 19 | try: 20 | func = tvm.build(s, bufs, target) 21 | # evaluator = func.time_evaluator(func.entry_name, ctx, number=number) 22 | # time_cost = evaluator(*tvm_arys).mean * 1e3 23 | beg = time.time() 24 | for i in range(number): 25 | func(*tvm_arys) 26 | end = time.time() 27 | time_cost = (end - beg) * 1e3 / number 28 | return time_cost 29 | except Exception as e: 30 | print(e) 31 | return float("inf") 32 | 33 | 34 | def pytorch_conv(batch, channel, out_channel, height, width, k_h, k_w, stride, pad, target, number=10): 35 | A = torch.rand((batch, channel, height, width), dtype=torch.float32) 36 | W = torch.rand((out_channel, channel, k_h, k_w), dtype=torch.float32) 37 | if target == "cuda": 38 | A = A.cuda() 39 | W = W.cuda() 40 | # warm-up 41 | Out = torch.nn.functional.conv2d(A, W, stride=stride, padding=pad) 42 | beg = time.time() 43 | for i in range(number): 44 | Out = torch.nn.functional.conv2d(A, W, stride=stride, padding=pad) 45 | end = time.time() 46 | return (end - beg) * 1e3 / number 47 | 48 | 49 | def tvm_conv(batch, channel, out_channel, height, width, k_h, k_w, stride, pad, target, devid=0, number=10): 50 | A = tvm.te.placeholder((batch, channel, height, width), dtype="float32") 51 | W = tvm.te.placeholder((out_channel, channel, k_h, k_w), dtype="float32") 52 | Output = conv2d_nchw(A, W, stride=stride, padding=pad) 53 | s = tvm.te.create_schedule(Output.op) 54 | bufs = [A, W, Output] 55 | return evaluate(s, bufs, target, devid, number) 56 | 57 | 58 | def compare(write_file): 59 | for config in all_conv_shapes: 60 | batch, channel, h, w, out_channel, _, k_h, k_w, _, stride, pad, _, _ = config 61 | torch_time = pytorch_conv(batch, channel, out_channel, h, w, k_h, k_w, stride, pad, "llvm", 10) 62 | tvm_time = tvm_conv(batch, channel, out_channel, h, w, k_h, k_w, stride, pad, "llvm", 1, 10) 63 | shape = (batch, channel, h, w, out_channel, k_h, k_w, stride, pad) 64 | print(shape, ": torch:", torch_time, " tvm", tvm_time, file=write_file, flush=True) 65 | 66 | 67 | if __name__ == "__main__": 68 | with open("cmp_conv_cpu.txt", "w") as f: 69 | compare(f) -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/complex-gemm.cl: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | extern "C" void default_function_kernel0( float* C, float* A1, float* A, float* B1, float* B) { 
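// Blocked GEMM: C[1024x512] = A[1024x256] * B[256x512]. Rows of C are tiled
// as i_inner_outer (32) x i_inner_inner (32), columns as j_outer (32) x
// j_inner (16), and the reduction as k_outer (32) x k_inner (8). A1 (32x8)
// and B1 (8x16) are staging buffers refilled from A and B once per k_outer
// step before the multiply-accumulate loop nest runs on them.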
6 | #pragma HLS INTERFACE m_axi port=C offset=slave bundle=gmem 7 | #pragma HLS INTERFACE s_axilite port=C bundle=control 8 | #pragma HLS INTERFACE m_axi port=A1 offset=slave bundle=gmem 9 | #pragma HLS INTERFACE s_axilite port=A1 bundle=control 10 | #pragma HLS INTERFACE m_axi port=A offset=slave bundle=gmem 11 | #pragma HLS INTERFACE s_axilite port=A bundle=control 12 | #pragma HLS INTERFACE m_axi port=B1 offset=slave bundle=gmem 13 | #pragma HLS INTERFACE s_axilite port=B1 bundle=control 14 | #pragma HLS INTERFACE m_axi port=B offset=slave bundle=gmem 15 | #pragma HLS INTERFACE s_axilite port=B bundle=control 16 | #pragma HLS INTERFACE s_axilite port=return bundle=control 17 | 18 | for (int i_inner_outer = 0; i_inner_outer < 32; ++i_inner_outer) { 19 | for (int j_outer = 0; j_outer < 32; ++j_outer) { 20 | for (int i_inner_inner_init = 0; i_inner_inner_init < 32; ++i_inner_inner_init) { 21 | for (int j_inner_init = 0; j_inner_init < 16; ++j_inner_init) { 22 | C[((((i_inner_outer * 16384) + (i_inner_inner_init * 512)) + (j_outer * 16)) + j_inner_init)] = 0.000000e+00f; 23 | } 24 | } 25 | for (int k_outer = 0; k_outer < 32; ++k_outer) { 26 | for (int i = 0; i < 32; ++i) { 27 | for (int j = 0; j < 8; ++j) { 28 | A1[((i * 8) + j)] = A[((((i_inner_outer * 8192) + (i * 256)) + (k_outer * 8)) + j)]; 29 | } 30 | } 31 | for (int i1 = 0; i1 < 8; ++i1) { 32 | for (int j1 = 0; j1 < 16; ++j1) { 33 | B1[((i1 * 16) + j1)] = B[((((k_outer * 4096) + (i1 * 512)) + (j_outer * 16)) + j1)]; 34 | } 35 | } 36 | for (int i_inner_inner = 0; i_inner_inner < 32; ++i_inner_inner) { 37 | for (int k_inner = 0; k_inner < 8; ++k_inner) { 38 | for (int j_inner = 0; j_inner < 16; ++j_inner) { 39 | C[((((i_inner_outer * 16384) + (i_inner_inner * 512)) + (j_outer * 16)) + j_inner)] = (C[((((i_inner_outer * 16384) + (i_inner_inner * 512)) + (j_outer * 16)) + j_inner)] + (A1[((i_inner_inner * 8) + k_inner)] * B1[((k_inner * 16) + j_inner)])); 40 | } 41 | } 42 | } 43 | } 44 | } 45 | } 46 | } -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/conv_example.cl: -------------------------------------------------------------------------------- 1 | #define USER_DEFINED_COMPUTE_SIZE_COMPUTE xxx 2 | #define USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER xxx 3 | #define USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER1 xxx 4 | 5 | __kernel void default_function_kernel0(__global float* restrict compute, __global float* restrict placeholder, __global float* restrict placeholder1) { 6 | // declare use local memory 7 | local float local_compute[USER_DEFINED_COMPUTE_SIZE_COMPUTE]; 8 | local float local_placeholder[USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER]; 9 | local float local_placeholder1[USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER1]; 10 | 11 | // read data to local memory 12 | event_t evt[3]; 13 | evt[0] = async_work_group_copy(local_compute, compute, USER_DEFINED_COMPUTE_SIZE_COMPUTE, 0); 14 | evt[1] = async_work_group_copy(local_placeholder, placeholder, USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER, 0); 15 | evt[2] = async_work_group_copy(local_placeholder1, placeholder1, USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER1, 0); 16 | // barrier 17 | wait_group_events(2, evt); 18 | 19 | // compute pipeline 20 | for (int b_inner = 0; b_inner < 8; ++b_inner) { 21 | for (int c = 0; c < 1024; ++c) { 22 | for (int h = 0; h < 7; ++h) { 23 | for (int w = 0; w < 7; ++w) { 24 | local_compute[((((b_inner * 50176) + (c * 49)) + (h * 7)) + w)] = 0.000000e+00f; 25 | for (int rc = 0; rc < 1024; ++rc) { 26 | for (int rw = 0; rw < 3; 
++rw) { 27 | // FPGA pipeline 28 | __attribute_((xcl_pipeline_loop)){ 29 | for (int rh = 0; rh < 3; ++rh) { 30 | local_compute[((((b_inner * 50176) + (c * 49)) + (h * 7)) + w)] = (local_compute[((((b_inner * 50176) + (c * 49)) + (h * 7)) + w)] + ((float)(((1 <= ((h * 2) + rh)) && (1 <= ((w * 2) + rw))) ? local_placeholder[(((((((b_inner * 200704) + (rc * 196)) + (h * 28)) + (rh * 14)) + (w * 2)) + rw) - 15)] : 0.000000e+00f) * local_placeholder1[((((c * 9216) + (rc * 9)) + (rh * 3)) + rw)])); 31 | } 32 | } 33 | } 34 | } 35 | } 36 | } 37 | } 38 | } 39 | 40 | 41 | // write data back to global memory 42 | evt[0] = async_work_group_copy(compute, local_compute, USER_DEFINED_COMPUTE_SIZE_COMPUTE, 0); 43 | evt[1] = async_work_group_copy(placeholder, local_placeholder, USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER, 0); 44 | evt[2] = async_work_group_copy(placeholder1, local_placeholder1, USER_DEFINED_COMPUTE_SIZE_PLACEHOLDER1, 0); 45 | // barrier 46 | wait_group_events(2, evt); 47 | } -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/hcl_gemm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test heterocl 3 | """ 4 | import heterocl as hcl 5 | import numpy as np 6 | import time 7 | 8 | def gemm(m=1024, n=1024, k=1024, dtype=hcl.Int(), target=None): 9 | matrix_1 = hcl.placeholder((m, k), dtype=dtype) 10 | matrix_2 = hcl.placeholder((k, n), dtype=dtype) 11 | 12 | def kernel(matrix_1, matrix_2): 13 | r = hcl.reduce_axis(0, k, 'k') 14 | 15 | mat1_buf = hcl.compute((m, k), 16 | lambda x, y: matrix_1[x, y], 17 | dtype=dtype, 18 | name="mat1_buf") 19 | 20 | mat2_buf = hcl.compute((k, n), 21 | lambda x, y: matrix_2[x, y], 22 | dtype=dtype, 23 | name="mat2_buf") 24 | 25 | return hcl.compute((m, n), 26 | lambda x, y: hcl.sum(mat1_buf[x, r] * mat2_buf[r, y], 27 | axis=r, dtype=dtype), 28 | dtype=dtype, 29 | name="out_matrix") 30 | 31 | s = hcl.create_schedule([matrix_1, matrix_2], kernel) 32 | out_matrix = kernel.out_matrix 33 | mat1_buf = kernel.mat1_buf 34 | mat2_buf = kernel.mat2_buf 35 | 36 | m_block = 4 37 | n_block = 8 38 | k_block = 16 39 | 40 | m , k = s[mat1_buf].op.axis 41 | #print(m , k) 42 | x0, x1 = s[mat1_buf].split(m, factor=m_block) 43 | z0, z1 = s[mat1_buf].split(k, factor=k_block) 44 | s[mat1_buf].reorder(x0, z0, x1, z1) 45 | 46 | k , n = s[mat2_buf].op.axis 47 | z0, z1 = s[mat2_buf].split(k, factor=k_block) 48 | y0, y1 = s[mat2_buf].split(n, factor=n_block) 49 | s[mat2_buf].reorder(y0, z0, y1, z1) 50 | 51 | 52 | m, n, k = s[out_matrix].op.axis 53 | #print(m, n, k) 54 | #s[out_matrix].reorder(n, m, k) 55 | 56 | x0, x1 = s[out_matrix].split(m, factor=m_block) 57 | y0, y1 = s[out_matrix].split(n, factor=n_block) 58 | z0, z1 = s[out_matrix].split(k, factor=k_block) 59 | 60 | s[out_matrix].reorder( x0, y0, z0, x1, y1, z1) 61 | 62 | s[mat1_buf].compute_at(s[out_matrix], s[out_matrix].op.axis[0]) 63 | 64 | 65 | #s[mat1_buf].compute_at(s[out_matrix], z0) 66 | 67 | #s[mat1_buf].compute_at(s[out_matrix], z1) 68 | 69 | #s[out_matrix].pipeline(x1) 70 | 71 | f = hcl.build(s, target=target) 72 | print(type(f)) 73 | print(f) 74 | #code = hcl.lower(s) 75 | #print(code) 76 | return f 77 | 78 | def time_gemm(dtype, m=1024, n=1024, k=1024, target=None): 79 | hcl.init(dtype) 80 | f = gemm(m, n, k, dtype, target) 81 | np_1 = np.random.randint(10, size=(m, k)) 82 | np_2 = np.random.randint(10, size=(k, n)) 83 | np_3 = np.matmul(np_1, np_2) 84 | 85 | hcl_m1 = hcl.asarray(np_1, dtype=dtype) 86 | hcl_m2 = 
hcl.asarray(np_2, dtype=dtype) 87 | hcl_m3 = hcl.asarray(np.zeros((m, n)), dtype=dtype) 88 | f(hcl_m1, hcl_m2, hcl_m3) 89 | begin = time.time() 90 | for i in range(10): 91 | f(hcl_m1, hcl_m2, hcl_m3) 92 | end = time.time() 93 | print("dtype is: ", dtype) 94 | print("average of 10 runs takes: {} sec".format((end - begin) / 10)) 95 | np.testing.assert_allclose(hcl_m3.asnumpy(), np_3, rtol=1e-03) 96 | 97 | ############################################################################### 98 | # Test the algorithm with different data types 99 | #dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)] 100 | dtypes = [hcl.Float()] 101 | for dtype in dtypes: 102 | time_gemm(dtype, m=256, n=512, k=1024, target="vhls") 103 | -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/schedule_conv2d_nchw_x86.py: -------------------------------------------------------------------------------- 1 | """ 2 | High performance schedule for conv2d_nchw 3 | Target X86 CPU 4 | 5 | ==================================== 6 | **Author**: `Size Zheng` 7 | """ 8 | import tvm 9 | from flextensor.measure import _evaluate 10 | from flextensor.nn import * 11 | 12 | 13 | def schedule_yolo_conv_x86(s, outputs, inputs, weight): 14 | # inline the padding operation 15 | padded = outputs.op.input_tensors[0] 16 | 17 | # create cache 18 | write_cache = s.cache_write(outputs, "local") 19 | 20 | # tunable parameters 21 | b_factors = [1, 1, 1] 22 | k_factors = [8, 2, 32] 23 | p_factors = [14, 2, 2] 24 | q_factors = [2, 1, 28] 25 | rc_factors = [32, 4, 2] # outer-->inner 26 | ry_factors = [1, 3, 1] 27 | rx_factors = [3, 1, 1] 28 | 29 | # split the spatial axes 30 | b, k, p, q = s[outputs].op.axis 31 | bo, bi = s[outputs].split(b, nparts=b_factors[0]) 32 | ko, ki = s[outputs].split(k, nparts=k_factors[0]) 33 | po, pi = s[outputs].split(p, nparts=p_factors[0]) 34 | qo, qi = s[outputs].split(q, nparts=q_factors[0]) 35 | 36 | vbo, bi = s[outputs].split(bi, nparts=b_factors[1]) 37 | vko, ki = s[outputs].split(ki, nparts=k_factors[1]) 38 | vpo, pi = s[outputs].split(pi, nparts=p_factors[1]) 39 | vqo, qi = s[outputs].split(qi, nparts=q_factors[1]) 40 | 41 | # reorder 42 | s[outputs].reorder(bo, ko, po, qo, vbo, vko, vpo, vqo, bi, ki, pi, qi) 43 | 44 | # fuse 45 | outer = s[outputs].fuse(bo, ko, po, qo) 46 | middle = s[outputs].fuse(vbo, vko, vpo, vqo) 47 | # inner = s[outputs].fuse(bi, ki, pi, qi) 48 | 49 | # vectorize 50 | # s[outputs].vectorize(inner) 51 | 52 | # parallel 53 | s[outputs].parallel(outer) 54 | 55 | # compute at write cache 56 | s[write_cache].compute_at(s[outputs], middle) 57 | 58 | # split reduce axes 59 | wb, wk, wp, wq = s[write_cache].op.axis 60 | # print(s[write_cache].op.reduce_axis) 61 | rc, ry, rx = s[write_cache].op.reduce_axis 62 | rco, rci = s[write_cache].split(rc, nparts=rc_factors[0]) 63 | rcm, rci = s[write_cache].split(rci, nparts=rc_factors[1]) 64 | rxo, rxi = s[write_cache].split(rx, nparts=rx_factors[0]) 65 | rxm, rxi = s[write_cache].split(rxi, nparts=rx_factors[1]) 66 | ryo, ryi = s[write_cache].split(ry, nparts=ry_factors[0]) 67 | rym, ryi = s[write_cache].split(ryi, nparts=ry_factors[1]) 68 | 69 | # reorder 70 | s[write_cache].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, wb, wk, wp, wq) 71 | 72 | 73 | s[outputs].pragma(outer, 'auto_unroll_max_step', 1500) 74 | s[write_cache].vectorize(s[write_cache].op.axis[-1]) 75 | 76 | s[padded].compute_inline() 77 | 78 | 79 | def try_yolo_conv(batch_size=1): 80 | # get the compute 81 | yolo_conv = 
YoloConvLayer6() 82 | input_shape = yolo_conv.get_intput_shape() 83 | inputs = tvm.te.placeholder((batch_size, *input_shape), dtype="float32") 84 | weight = yolo_conv.get_weight() 85 | outputs = yolo_conv(inputs) 86 | bias = yolo_conv.get_bias() 87 | 88 | s = tvm.te.create_schedule(outputs.op) 89 | schedule_yolo_conv_x86(s, outputs, inputs, weight) 90 | 91 | if bias is None: 92 | arg_bufs = [inputs, weight, outputs] 93 | else: 94 | arg_bufs = [inputs, weight, bias, outputs] 95 | stmt = tvm.lower(s, arg_bufs, simple_mode=True) 96 | print(stmt) 97 | dev_id = 1 98 | time_cost = _evaluate(s, arg_bufs, "llvm", dev_id, 100) 99 | print("Yolo conv6 use", time_cost, "ms") 100 | 101 | 102 | if __name__ == "__main__": 103 | try_yolo_conv(batch_size=1) -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/schedule_gemm_vhls.py: -------------------------------------------------------------------------------- 1 | import heterocl as hcl 2 | 3 | 4 | def kernel_gemm(A, B): 5 | k = hcl.reduce_axis(0, A.shape[1], "k") 6 | return hcl.compute( 7 | (A.shape[0], B.shape[1]), 8 | lambda i, j: hcl.sum(A[i, k] * B[k, j], axis=k), 9 | "C") 10 | 11 | 12 | def main(): 13 | M = 512 14 | N = 512 15 | K = 512 16 | A = hcl.placeholder((M, K), dtype="float32", name="A") 17 | B = hcl.placeholder((K, N), dtype="float32", name="B") 18 | 19 | s = hcl.create_schedule([A, B], kernel_gemm) 20 | # split 21 | C = kernel_gemm.C 22 | m, n, k = s[C].op.axis 23 | mo, mi = s[C].split(m, factor=16) 24 | no, ni = s[C].split(n, factor=32) 25 | ko, ki = s[C].split(k, factor=8) 26 | 27 | 28 | # reorder shuffle 29 | s[C].reorder(mo, no, mi, ni, ko, ki) 30 | 31 | 32 | # reorder local 33 | s[C].reorder(mi, ko, ki, ni) 34 | 35 | 36 | # reshape 37 | s.reshape(C, [512//16, 16, 512//32, 32]) 38 | 39 | 40 | # partition 41 | s.partition(A, dim=3) 42 | 43 | 44 | # pipeline 45 | s[C].pipeline(mi) 46 | 47 | 48 | # reuse_at 49 | # nothing to do 50 | 51 | print(hcl.build(s, target="vhls")) 52 | 53 | 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | """ 59 | // result: 60 | #include 61 | #include 62 | #include 63 | 64 | void default_function(float A[512][512], float B[512][512], ap_int<32> C[32][16][16][32]) { 65 | #pragma HLS array_partition variable=A complete dim=3 66 | for (ap_int<32> i_outer = 0; i_outer < 32; ++i_outer) { 67 | for (ap_int<32> j_outer = 0; j_outer < 16; ++j_outer) { 68 | for (ap_int<32> i_inner = 0; i_inner < 16; ++i_inner) { 69 | #pragma HLS pipeline 70 | for (ap_int<32> k_outer = 0; k_outer < 64; ++k_outer) { 71 | ap_int<32> sum; 72 | sum = 0; 73 | for (ap_int<32> k_inner = 0; k_inner < 8; ++k_inner) { 74 | for (ap_int<32> j_inner = 0; j_inner < 32; ++j_inner) { 75 | sum = ((ap_int<32>)((A[(i_inner + (i_outer * 16))][(k_inner + (k_outer * 8))] * B[(k_inner + (k_outer * 8))][(j_inner + (j_outer * 32))]) + ((float)sum))); 76 | } 77 | } 78 | C[i_outer][i_inner][j_outer][j_inner] = sum; 79 | } 80 | } 81 | } 82 | } 83 | 84 | """ -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/schedule_shfit_x86.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | import math 3 | import torch 4 | import numpy as np 5 | from flextensor.nn import ShiftConv2d_nhwc 6 | 7 | shift_conv2d_shape = [ 8 | # ShiftNet(https://arxiv.org/abs/1801.09392) with input size: 256*256 9 | (1, 128, 128, 64, 3, 1), 10 | (1, 128, 128, 64, 3, 1), 11 | (1, 64, 64, 128, 5, 1), 12 | (1, 32, 
32, 256, 3, 1), 13 | (1, 16, 16, 512, 3, 1) 14 | ] 15 | 16 | DEV_ID = 0 17 | 18 | 19 | def schedule_shift_1_x86(s, Img, KernelIndex, Output): 20 | return 21 | 22 | 23 | def evaluate(shape, schedule_func): 24 | N, H, W, C, k, dilation = shape 25 | stride = 1 26 | Img = tvm.te.placeholder([N, H, W, C], dtype="float32") 27 | KernelIndex = tvm.te.placeholder([C], dtype="int32") 28 | Output = ShiftConv2d_nhwc(Img, KernelIndex, k, dilation, stride) 29 | 30 | s = tvm.te.create_schedule(Output.op) 31 | schedule_func(s, Img, KernelIndex, Output) 32 | 33 | func = tvm.build(s, [Img, KernelIndex, Output], "llvm") 34 | Img_torch = torch.rand([N, H, W, C], dtype=torch.float32) 35 | Kernel_torch = torch.rand([C, k, k], dtype=torch.float32) 36 | KernelIndex_torch = torch.argmax(Kernel_torch.reshape([C, -1]), dim=1) 37 | 38 | paddings = [math.ceil(((stride - 1) * H - stride + dilation * (k - 1)) / 2), 39 | math.ceil(((stride - 1) * W - stride + dilation * (k - 1)) / 2)] 40 | image_height = H 41 | image_width = W 42 | out_height = math.floor((image_height + 2 * paddings[0]- dilation * (k - 1) - 1) / stride + 1) 43 | out_width = math.floor((image_width + 2 * paddings[1] - dilation * (k - 1) - 1) / stride + 1) 44 | output_shape = (N, out_height, out_width, C) 45 | 46 | Output_torch = torch.zeros(output_shape, dtype=torch.float32) 47 | 48 | ctx = tvm.device("llvm", DEV_ID) 49 | 50 | Img_tvm = tvm.nd.array(Img_torch.numpy().astype(np.float32), ctx) 51 | KernelIndex_tvm = tvm.nd.array(KernelIndex_torch.numpy().astype(np.int32), ctx) 52 | Output_tvm = tvm.nd.array(Output_torch.numpy().astype(np.float32), ctx) 53 | 54 | 55 | evaluator = func.time_evaluator(func.entry_name, ctx, number=10) 56 | time_cost = evaluator(Img_tvm, KernelIndex_tvm, Output_tvm).mean * 1e3 57 | 58 | return time_cost 59 | 60 | 61 | 62 | def main(): 63 | print(evaluate(shift_conv2d_shape[0], schedule_shift_1_x86)) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/simple-gemm.cl: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | extern "C" void default_function_kernel0( float* C, float* A, float* B) { 6 | #pragma HLS INTERFACE m_axi port=C offset=slave bundle=gmem 7 | #pragma HLS INTERFACE s_axilite port=C bundle=control 8 | #pragma HLS INTERFACE m_axi port=A offset=slave bundle=gmem 9 | #pragma HLS INTERFACE s_axilite port=A bundle=control 10 | #pragma HLS INTERFACE m_axi port=B offset=slave bundle=gmem 11 | #pragma HLS INTERFACE s_axilite port=B bundle=control 12 | #pragma HLS INTERFACE s_axilite port=return bundle=control 13 | 14 | for (int i_inner_outer = 0; i_inner_outer < 32; ++i_inner_outer) { 15 | for (int j_outer = 0; j_outer < 32; ++j_outer) { 16 | for (int i_inner_inner_init = 0; i_inner_inner_init < 32; ++i_inner_inner_init) { 17 | for (int j_inner_init = 0; j_inner_init < 16; ++j_inner_init) { 18 | C[((((i_inner_outer * 16384) + (i_inner_inner_init * 512)) + (j_outer * 16)) + j_inner_init)] = 0.000000e+00f; 19 | } 20 | } 21 | for (int k_outer = 0; k_outer < 32; ++k_outer) { 22 | for (int i_inner_inner = 0; i_inner_inner < 32; ++i_inner_inner) { 23 | for (int k_inner = 0; k_inner < 8; ++k_inner) { 24 | for (int j_inner = 0; j_inner < 16; ++j_inner) { 25 | C[((((i_inner_outer * 16384) + (i_inner_inner * 512)) + (j_outer * 16)) + j_inner)] = (C[((((i_inner_outer * 16384) + (i_inner_inner * 512)) + (j_outer * 16)) + j_inner)] + 
(A[((((i_inner_outer * 8192) + (i_inner_inner * 256)) + (k_outer * 8)) + k_inner)] * B[((((k_outer * 4096) + (k_inner * 512)) + (j_outer * 16)) + j_inner)])); 26 | } 27 | } 28 | } 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/tune_conv2d_NCHWc.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import time 4 | import json 5 | import tvm 6 | from flextensor.task import register_task, Task 7 | from flextensor.measure import _evaluate 8 | from flextensor.nn import conv2d_nchwc 9 | from flextensor.configs.conv2d_config import yolo_shapes_b1 10 | from flextensor.scheduler import schedule, schedule_with_config 11 | from flextensor.utils import RpcInfo 12 | 13 | 14 | def conv2d_nchwc_compute_avx2(N, C, H, W, K, k=3, use_bias=False, st=1, pad=0, dilation=1, group=1, vlen1=8, vlen2=8): 15 | inputs = tvm.te.placeholder([N, C // vlen1 // group, H, W, vlen1], dtype="float32") 16 | weight = tvm.te.placeholder([K // vlen2, C // vlen1 // group, k, k, vlen1, vlen2], dtype="float32") 17 | if use_bias: 18 | bias = tvm.te.placeholder([K // vlen2, vlen2], dtype="float32") 19 | else: 20 | bias = None 21 | output = conv2d_nchwc(inputs, weight, bias, stride=st, padding=pad, dilation=dilation, groups=group) 22 | if use_bias: 23 | return output, [inputs, weight, bias, output] 24 | else: 25 | return [output.op], [inputs, weight, output] 26 | 27 | 28 | if __name__ == "__main__": 29 | N, C, H, W, K, _, k, _, _, st, pad, dilation, group = yolo_shapes_b1[5] 30 | 31 | use_bias = False 32 | vlen = 8 33 | target = "llvm" 34 | dev_id = 0 35 | trials = 100 36 | timeout = 10 37 | parallel = 20 38 | method = "searching" 39 | force_inline = True 40 | use_model = False 41 | logfile = open("tmp.log", "w") 42 | rpc_info = RpcInfo("0.0.0.0", 9090, target_host="llvm") 43 | 44 | args = (N, C, H, W, K, k, use_bias, st, pad, dilation, group) 45 | task = Task("conv2d_nchwc", "yolo_conv6", conv2d_nchwc_compute_avx2, args, target, dev_id=dev_id) 46 | register_task(task, override=False) 47 | 48 | beg = time.time() 49 | s, bufs, configs = schedule( 50 | task.key, 51 | op_trial=trials, 52 | timeout=timeout, 53 | op_stop=30, 54 | parallel=parallel, 55 | method=method, 56 | use_model=use_model, 57 | trials=[trials//10, trials], 58 | force_inline=force_inline, 59 | rpc_info=rpc_info, 60 | slevel=2, 61 | rlevel=2 62 | ) 63 | end = time.time() 64 | 65 | print("######################################") 66 | print("op schedules:") 67 | for config in configs.op_config_lst: 68 | print("----------------------------------") 69 | for name, value in config.items(): 70 | if value: 71 | print(name, value) 72 | print("graph schedules:") 73 | for name, value in configs.graph_config.items(): 74 | if value: 75 | print(name, value) 76 | string = json.dumps(configs) 77 | line = task.key + ":" + string 78 | print(line, file=logfile, flush=True) 79 | s, bufs = schedule_with_config(task.key, configs) 80 | time_cost = _evaluate(s, bufs, target, dev_id, 10) 81 | print("Use", time_cost, "ms", "throughput: %f GFLOPS" % (N * C * H * W * K * k * k / st / st / group / 1e6 / time_cost)) 82 | print("Cost", end - beg, "s") 83 | 84 | logfile.close() 85 | -------------------------------------------------------------------------------- /flextensor/testing/others/hand-craft/tvm_pragma.py: -------------------------------------------------------------------------------- 1 | import tvm 2 | 
import numpy as np 3 | import os 4 | 5 | 6 | M = 1024 7 | N = 512 8 | K = 256 9 | 10 | 11 | def test1(): 12 | A = tvm.te.placeholder([M, K], name="A") 13 | B = tvm.te.placeholder([K, N], name="B") 14 | k = tvm.te.reduce_axis((0, K), name="k") 15 | # A1 = tvm.te.compute([M, K], lambda i, j: A[i, j], "A1") 16 | # B1 = tvm.te.compute([K, N], lambda i, j: B[i, j], "B1") 17 | C = tvm.te.compute([M, N], lambda i, j: tvm.te.sum(A[i, k] * B[k, j], axis=[k]), "C") 18 | 19 | s = tvm.te.create_schedule(C.op) 20 | 21 | A1 = s.cache_read(A, "local", [C]) 22 | B1 = s.cache_read(B, "local", [C]) 23 | 24 | m, n = s[C].op.axis 25 | om, im = s[C].split(m, nparts=1) 26 | s[C].bind(om, tvm.te.thread_axis("blockIdx.x")) 27 | mo, mi = s[C].split(im, factor=32) 28 | no, ni = s[C].split(n, factor=16) 29 | k = s[C].op.reduce_axis[0] 30 | ko, ki = s[C].split(k, factor=8) 31 | 32 | s[C].reorder(mo, no, ko, mi, ki, ni) 33 | # s[C].bind(no, tvm.te.thread_axis("threadIdx.x")) 34 | 35 | s[A1].compute_at(s[C], ko) 36 | s[B1].compute_at(s[C], ko) 37 | 38 | # print(tvm.lower(s, [A, B, C])) 39 | 40 | f = tvm.build(s, [A, B, C], target="opencl") 41 | print(dir(f)) 42 | print(f.get_source()) 43 | print(f.imported_modules[0].get_source()) 44 | 45 | 46 | def test2(): 47 | tgt_host="llvm" 48 | tgt="aocl_sw_emu" 49 | n = tvm.te.var("n") 50 | A = tvm.te.placeholder((n,), name='A') 51 | B = tvm.te.placeholder((n,), name='B') 52 | C = tvm.te.compute(A.shape, lambda i: A[i] + B[i], name="C") 53 | 54 | s = tvm.te.create_schedule(C.op) 55 | px, x = s[C].split(C.op.axis[0], nparts=1) 56 | 57 | s[C].bind(px, tvm.te.thread_axis("pipeline")) 58 | 59 | fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd") 60 | 61 | fadd.save("myadd.o") 62 | fadd.imported_modules[0].save("myadd.aocx") 63 | 64 | tvm.contrib.cc.create_shared("myadd.so", ["myadd.o"]) 65 | 66 | 67 | def run_aocl(): 68 | tgt="aocl_sw_emu" 69 | 70 | fadd = tvm.runtime.module.load_module("myadd.so") 71 | fadd_dev = tvm.runtime.module.load_module("myadd.aocx") 72 | fadd.import_module(fadd_dev) 73 | 74 | ctx = tvm.device(tgt, 0) 75 | 76 | n = 1024 77 | a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx) 78 | b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx) 79 | c = tvm.nd.array(np.zeros(n, dtype="float32"), ctx) 80 | 81 | fadd(a, b, c) 82 | tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) 83 | 84 | 85 | def test3(): 86 | A = tvm.te.placeholder([M, K], name="A") 87 | B = tvm.te.placeholder([K, N], name="B") 88 | k = tvm.te.reduce_axis((0, K), name="k") 89 | C = tvm.te.compute([M, N], lambda i, j: tvm.te.sum(A[i, k] * B[k, j], axis=[k]), "C") 90 | 91 | s = tvm.te.create_schedule(C.op) 92 | 93 | m, n = s[C].op.axis 94 | om, im = s[C].split(m, nparts=1) 95 | s[C].bind(om, tvm.te.thread_axis("pipeline")) 96 | mo, mi = s[C].split(im, factor=32) 97 | no, ni = s[C].split(n, factor=16) 98 | k = s[C].op.reduce_axis[0] 99 | ko, ki = s[C].split(k, factor=8) 100 | 101 | s[C].reorder(mo, no, ko, mi, ki, ni) 102 | 103 | print(tvm.lower(s, [A, B, C])) 104 | 105 | f = tvm.build(s, [A, B, C], target="sdaccel") 106 | print(dir(f)) 107 | print(f.get_source()) 108 | print(f.imported_modules[0].get_source()) 109 | 110 | 111 | def main(): 112 | test1() 113 | # test3() 114 | 115 | 116 | if __name__ == "__main__": 117 | main() -------------------------------------------------------------------------------- /flextensor/testing/others/profile/Makefile: -------------------------------------------------------------------------------- 1 | CXX 
:= nvcc 2 | TARGET := profile_autate 3 | CUDNN_PATH := /usr/local/cuda-10.1 4 | HEADERS := -I $(CUDNN_PATH)/include 5 | LIBS := -L $(CUDNN_PATH)/lib64 -L/usr/local/lib 6 | LIBDEVICE := --dont-use-profile -ldir $(CUDNN_PATH)/nvvm/libdevice 7 | CXXFLAGS := -arch=sm_60 -std=c++11 -O2 8 | 9 | all: conv 10 | 11 | conv: $(TARGET).cu 12 | $(CXX) $(CXXFLAGS) $(LIBDEVICE) $(HEADERS) $(LIBS) $(TARGET).cu -o $(TARGET) 13 | 14 | .PHONY: clean 15 | 16 | clean: 17 | rm $(TARGET) || echo -n "" -------------------------------------------------------------------------------- /flextensor/testing/others/profile/compute_flops.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from flextensor.configs.conv2d_config import yolo_shapes 3 | 4 | 5 | def gflops(batch, in_channel, out_channel, H, W, k_h, k_w, stride, padding, dilation): 6 | out_h = (H + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1 7 | out_w = (W + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1 8 | return 2 * batch * out_h * out_w * in_channel * out_channel * k_h * k_w / 1e9 9 | 10 | 11 | def perf(gflops, millis): 12 | return gflops / (millis / 1e3) 13 | 14 | 15 | if __name__ == "__main__": 16 | # flextensor 17 | millis = [ 18 | 0.1006952, 19 | 0.2825153, 20 | 0.0252457, 21 | 0.2062096, 22 | 0.0571187, 23 | 0.7426347, 24 | 0.0372696, 25 | 0.210653, 26 | 0.0540586, 27 | 0.7972785, 28 | 0.0652985, 29 | 0.2498188, 30 | 0.5609756, 31 | 0.3801411, 32 | 0.2718407 33 | ] 34 | # autotvm 35 | # millis = [ 36 | # 0.145611, 37 | # 0.385738, 38 | # 0.038619, 39 | # 0.311103, 40 | # 0.080117, 41 | # 0.897629, 42 | # 0.059699, 43 | # 0.287437, 44 | # 0.090796, 45 | # 0.903871, 46 | # 0.069489, 47 | # 0.399444, 48 | # 0.668653, 49 | # 0.588122, 50 | # 0.555237 51 | # ] 52 | i = 0 53 | for shape in yolo_shapes: 54 | batch, in_c, H, W, out_c, _, k_h, k_w, _, stride, padding, dilation, groups = shape 55 | print(perf(gflops(batch, in_c, out_c, H, W, k_h, k_w, stride, padding, dilation), millis[i])) 56 | i += 1 57 | -------------------------------------------------------------------------------- /flextensor/testing/others/profile/profile_flextensor_yolo_b1_conv3.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/FlexTensor/98db7a952a0508f792edfd876527c8517a5ccd36/flextensor/testing/others/profile/profile_flextensor_yolo_b1_conv3.cu --------------------------------------------------------------------------------
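For reference, a worked instance of the gflops()/perf() arithmetic above. The layer shape is taken from the first ConvBlock in pytorch-yolo-v1.py (3->64 channels, 7x7 kernel, stride 2, padding 3, 448x448 input); batch=1 and the pairing with the first millis entry are assumptions for illustration, since yolo_shapes itself is defined elsewhere in the repository:

out_h = (448 + 2 * 3 - 1 * (7 - 1) - 1) // 2 + 1       # = 224
flop_g = 2 * 1 * out_h * out_h * 3 * 64 * 7 * 7 / 1e9  # ~0.944 GFLOP
print(flop_g / (0.1006952 / 1e3))                      # ~9376 GFLOPS, under
                                                       # the pairing assumed above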