├── README.md
├── source-analysis
    ├── README.md
    ├── 使用TVM进行向量加法.md
    ├── 创建自定义的OP.md
    ├── topi是怎样工作的.md
    ├── 加载ONNX模型及执行过程分析.md
    ├── 计算图.md
    ├── tvm是如何生成cuda程序的.md
    ├── 最简单的Tuner分析.md
    ├── 如何利用已生成的高效代码.md
    ├── 通过计算图如何生成最简单的代码.md
    └── Python和C++之间函数调用的实现.md
└── mytophub
    └── 20191227
        ├── vgg16_bs_8.log
        ├── vgg16_bs_128.log
        ├── vgg16_bs_1.log
        ├── resnet50v1_bs_128.log
        ├── resnet50v1_bs_1.log
        ├── resnet50v1_bs_8.log
        ├── mobilenet_v2_bs_8.log
        ├── mobilenet_v2_bs_1.log
        └── mobilenet_v2_bs_128.log


/README.md:
--------------------------------------------------------------------------------
1 | # TVMDeepDive


--------------------------------------------------------------------------------
/source-analysis/README.md:
--------------------------------------------------------------------------------
1 | 
2 | This folder contains notes for reading TVM source code.
3 | 
4 | 


--------------------------------------------------------------------------------
/source-analysis/使用TVM进行向量加法.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ```python
 3 | import tvm
 4 | import numpy as np
 5 | 
 6 | m = 3
 7 | ctx = tvm.context("cpu", 1)
 8 | A = tvm.placeholder((m,), name='A')
 9 | B = tvm.compute((m,), lambda i: A[i]+2, name='B')
10 | s = tvm.create_schedule(B.op)
11 | fm = tvm.build(s, [A, B], "cpu", target_host="llvm", name="mul")
12 | 
13 | ff = fm.get_function("mul")
14 | ff = fm.entry_func
15 | 
16 | a = tvm.nd.array(np.zeros(m, A.dtype), ctx)
17 | b = tvm.nd.array(np.zeros(m, A.dtype), ctx)
18 | res = ff(a, b)
19 | print(b)
20 | ```
21 | 输出结果：
22 | [2. 2. 2.]
23 | 


--------------------------------------------------------------------------------
/source-analysis/创建自定义的OP.md:
--------------------------------------------------------------------------------
 1 | 
 2 | 在TVM的教程里，有一个自定义OP的例子：
 3 | https://docs.tvm.ai/dev/relay_add_op.html
 4 | 
 5 | ```c++
 6 | RELAY_REGISTER_OP("add")
 7 |     .set_num_inputs(2)
 8 |     .add_argument("lhs", "Tensor", "The left hand side tensor.")
 9 |     .add_argument("rhs", "Tensor", "The right hand side tensor.")
10 |     .set_support_level(1)
11 |     .add_type_rel("Broadcast", BroadcastRel);
12 | ```
13 | 
14 | RELAY_REGISTER_OP是一个宏，在使用时创建了一个OpRegistry，并注册到全局的dmlc::Registry对象里。
15 | 
16 | ```c++
17 | class OpRegistry {
18 |  public:
19 |   ......
20 | 
21 |   TVM_DLL static ::dmlc::Registry<OpRegistry>* Registry();
22 | 
23 |  private:
24 |   std::string name;
25 |   Op op_;
26 | };
27 | 
28 | 
29 | #define RELAY_REGISTER_OP(OpName)                        \
30 |   DMLC_STR_CONCAT(RELAY_REGISTER_VAR_DEF, __COUNTER__) = \
31 |       ::tvm::relay::OpRegistry::Registry()               \
32 |           ->__REGISTER_OR_GET__(OpName)                  \
33 |           .set_name()
34 | ```
35 | 
36 | 


--------------------------------------------------------------------------------
/source-analysis/topi是怎样工作的.md:
--------------------------------------------------------------------------------
 1 | 
 2 | TOPI可以看做是一组上层的API。用户在创建计算图的时候，如果只使用tvm api或者relay的算子，要实现复杂的计算就比较麻烦；topi为了简化这些工作，提供了很多
 3 | 预先定义好的常用算子实现。
 4 | 
 5 | 关于TOPI的文档不多，我们看看在TVM里TOPI的代码是怎样起作用的。
 6 | 
 7 | 由于还不知道上层的入口如何进入到TOPI，我们先从代码看一下哪里用到了topi。在src里搜索可以看到很多地方用到了，比较典型的一个是：
 8 | relay/op/tensor/binary.cc:.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::multiply));
 9 | 
10 | 我们看一下具体的实现：
11 | src/relay/op/tensor/binary.cc
12 | ```c++
13 | #define RELAY_BINARY_COMPUTE(FTOPI)                        \
14 |   [] (const Attrs& attrs,                                  \
15 |       const Array<Tensor>& inputs,                         \
16 |       const Type& out_type,                                \
17 |       const Target& target) -> Array<Tensor> {             \
18 |     CHECK_EQ(inputs.size(), 2U);                           \
19 |     return {FTOPI(inputs[0], inputs[1])};                  \
20 |   }   
21 | 
22 | // Addition
23 | RELAY_REGISTER_BINARY_OP("add")
24 | .describe("Elementwise add with with broadcasting")
25 | .set_support_level(1)
26 | .set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::add));
27 | ```
28 | 可见在relay里，通过宏RELAY_REGISTER_BINARY_OP注册了函数add，其实现为topi::add，注册的key为FTVMCompute。
29 | 
30 | 我们看看RELAY_REGISTER_BINARY_OP这个宏具体做了什么：
31 | 
32 | src/relay/op/op_common.h
33 | ```c++
34 | #define RELAY_REGISTER_BINARY_OP(OpName)                          \
35 |   TVM_REGISTER_API("relay.op._make." OpName)                      \
36 |     .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {    \
37 |         static const Op& op = Op::Get(OpName);                    \
38 |         return CallNode::make(op, {lhs, rhs}, Attrs(), {});       \
39 |       });                                                         \
40 |   RELAY_REGISTER_OP(OpName)                                       \
41 |     .set_num_inputs(2)                                            \
42 |     .add_argument("lhs", "Tensor", "The left hand side tensor.")  \
43 |     .add_argument("rhs", "Tensor", "The right hand side tensor.") \
44 |     .add_type_rel("Broadcast", BroadcastRel)                      \
45 |     .set_attr<TOpPattern>("TOpPattern", kBroadcast)               \
46 |     .set_attr<TOpIsStateful>("TOpIsStateful", false)              \
47 |     .set_attr<FInferCorrectLayout>("FInferCorrectLayout",         \
48 |                                    BinaryBroadcastLayout)
49 | ```
50 | 
51 | include/tvm/relay/op.h
52 | ```c++
53 | #define RELAY_REGISTER_OP(OpName)                        \
54 |   DMLC_STR_CONCAT(RELAY_REGISTER_VAR_DEF, __COUNTER__) = \
55 |       ::tvm::relay::OpRegistry::Registry()               \
56 |           ->__REGISTER_OR_GET__(OpName)                  \
57 |           .set_name()
58 | ```
59 | 
60 | 可见注册add这个Op的过程一共做了三件事，其中前两件由宏RELAY_REGISTER_BINARY_OP完成，第三件由宏后面追加的.set_attr调用完成：
61 | 1. 注册一个TVM的全局API，relay.op._make.add，其实现是根据参数OpName取得对应的Op，并返回一个CallNode。
62 | 2. 向relay注册一个Op，并且设置Name、argument、type_relation，以及一些属性。关于这些属性我们暂时不去关注。
63 | 3. 在relay里对应该Op的EntryType里设置属性FTVMCompute为topi::add函数。
64 | 
65 | 既然在这里设置了属性，在生成代码的时候肯定要获取该属性，很容易查到
66 | relay/backend/compile_engine.cc:        Op::GetAttr<FTVMCompute>("FTVMCompute");
67 | 
68 | 该调用发生在函数VisitExpr_()里：
69 | Array<Tensor> VisitExpr_(const CallNode* call_node) final {
70 |   
71 | ======== 待续===========
72 | 


--------------------------------------------------------------------------------
/source-analysis/加载ONNX模型及执行过程分析.md:
--------------------------------------------------------------------------------
  1 | 
  2 | 我们需要尝试一下TVM对PyTorch模型的支持，但是TVM还不支持直接加载PyTorch的模型，在TVM的教程里有个加载ONNX模型的示例，根据这个示例我们先将PyTorch
  3 | 的模型转为ONNX格式，然后再用TVM来加载执行。
  4 | 
  5 | 
  6 | 这里是代码部分：
  7 | ```python
  8 | import onnx
  9 | import numpy as np
 10 | import tvm
 11 | import tvm.relay as relay
 12 | from tvm.contrib.download import download_testdata
 13 | 
 14 | onnx_model = onnx.load("resnet50/model.onnx")
 15 | 
 16 | from torchvision import transforms
 17 | 
 18 | normalize = transforms.Normalize(
 19 |     mean=[0.485, 0.456, 0.406], #这是imagenet數據集的均值
 20 |     std=[0.229, 0.224, 0.225]
 21 | )
 22 |  
 23 | tran=transforms.Compose([
 24 |     transforms.Resize((224,224)),
 25 |     transforms.ToTensor(),
 26 |     transforms.Normalize(mean=[0.485, 0.456, 0.406],
 27 |                                  std=[0.229, 0.224, 0.225])
 28 | ])
 29 | 
 30 | im = Image.open("cat.jpeg")
 31 | x = tran(im)
 32 | x.unsqueeze_(dim=0)
 33 | 
 34 | target = 'cuda'
 35 | target_host = 'llvm'
 36 | layout = "NCHW"
 37 | ctx = tvm.gpu(0)
 38 | 
 39 | #target = 'llvm'
 40 | #layout = "NCHW"
 41 | #ctx = tvm.cpu()
 42 | print(ctx.device_type)
 43 | 
 44 | #input_name = 'data'
 45 | input_name='gpu_0/data_0'
 46 | shape_dict = {input_name: x.shape}
 47 | mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
 48 | 
 49 | with relay.build_config(opt_level=1):
 50 |     intrp = relay.build_module.create_executor('graph', mod, tvm.gpu(0), target)
 51 | 
 52 | dtype = 'float32'
 53 | x_numpy = x.numpy()
 54 | t = tvm.nd.array(x_numpy.astype(dtype))
 55 | %time it = intrp.evaluate()
 56 |     
 57 | %time tvm_output = it(t, **params).asnumpy()
 58 | ```
 59 | 
 60 | 为了进一步了解过程，对执行过程做了Pdb和gdb的跟踪，在这里记录一下，后续再详细分析
 61 | 
 62 | ===============================================================
 63 | mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
 64 | with relay.build_config(opt_level=1):
 65 |     intrp = relay.build_module.create_executor('graph', mod, tvm.gpu(0), target)
 66 | 	return GraphExecutor(mod, ctx, target)
 67 | 	GraphExecutor(_interpreter.Executor)
 68 | 
 69 | 
 70 | 
 71 | it = intrp.evaluate()  /home/heyunlong3/notespace/from_onnx.py:83
 72 | 	GraphExecutor._make_executor()  /home/heyunlong3/tvm/tvm/python/tvm/relay/backend/interpreter.py(215)
 73 | 		graph_json, mod, params = build(self.mod, target=self.target)  /home/heyunlong3/tvm/tvm/python/tvm/relay/build_module.py(240)
 74 |     			bld_mod = BuildModule()
 75 |     			graph_json, mod, params = bld_mod.build(func, target, target_host, params)
 76 | 				self._build(func, target, target_host)
 77 | 				[C++] RelayBuildModule::GetFunction("build")
 78 | 					this->Build(args[0], args[1], args[2]);
 79 | 						BuildRelay(func, params_);
 80 | 							relay::Module relay_module = relay::ModuleNode::FromExpr(func);
 81 | 							relay_module = Optimize(relay_module, targets_, params);
 82 | 							// Generate code for the updated function.
 83 | 							graph_codegen_ = std::unique_ptr<GraphCodegen>(new GraphCodegen());
 84 | 							graph_codegen_->Init(nullptr, targets_);
 85 | 								GraphRuntimeCodegenModule::init()
 86 | 							graph_codegen_->Codegen(func);
 87 | 
 88 | 							ret_.graph_json = graph_codegen_->GetJSON();
 89 | 							ret_.params = graph_codegen_->GetParams();
 90 | 
 91 | 							auto lowered_funcs = graph_codegen_->GetLoweredFunc();
 92 | 							if (lowered_funcs.size() != 0) {
 93 |  								 ret_.mod = tvm::build(
 94 |   								  lowered_funcs,
 95 |  								   target_host_,
 96 | 								    BuildConfig::Current());
 97 | 							}
 98 | 
 99 | 
100 | 
101 | 		gmodule = _graph_rt.create(graph_json, mod, self.ctx)  /home/heyunlong3/tvm/tvm-debug/python/tvm/contrib/graph_runtime.py(25)
102 | 			fcreate = get_global_func("tvm.graph_runtime.create")
103 | 			return GraphModule(fcreate(graph_json_str, libmod, *device_type_id))
104 | 				[C++] GraphRuntimeCreate()  src/runtime/graph/graph_runtime.cc(509)  /home/heyunlong3/tvm/tvm-debug/src/runtime/graph/graph_runtime.cc(482)
105 | 					GraphRuntime.init()    /home/heyunlong3/tvm/tvm-debug/src/runtime/graph/graph_runtime.cc(71)
106 | 						this->Load(&reader);  
107 | 						this->SetupOpExecs();  
108 |     							std::tie(op_execs_[nid], op_args) = CreateTVMOp(inode.param, args, inode.inputs.size());   /home/heyunlong3/tvm/tvm-debug/src/runtime/graph/graph_runtime.cc(349)
109 |  
110 |   								tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, false);   /home/heyunlong3/tvm/tvm-debug/src/runtime/graph/graph_runtime.cc(403) // Get compiled function from the module that contains both host and device  
111 | 
112 |   								auto fexec = [arg_ptr, pf]() {…
113 | 
114 | 
115 | 		if params:
116 |     			gmodule.set_input(**params)
117 | 
118 | 
119 | tvm_output = it(t, **params).asnumpy()
120 | 	_graph_wrapper(*args, **kwargs):
121 | 		for i, arg in enumerate(args):
122 |     			gmodule.set_input(i, arg)
123 | 		# Run the module, and fetch the output.
124 | 		gmodule.run()
125 | 			GraphRuntime::Run()
126 | 
127 | 


--------------------------------------------------------------------------------
/source-analysis/计算图.md:
--------------------------------------------------------------------------------
  1 | 
  2 | 计算图可以用tvm的api来实现，下面是一个简单的例子
  3 | 
  4 | ```python
  5 | # Matmul V1: List candidate values
  6 | @autotvm.template  # 1. use a decorator
  7 | def matmul_v1(N, L, M, dtype):
  8 |     A = tvm.placeholder((N, L), name='A', dtype=dtype)
  9 |     B = tvm.placeholder((L, M), name='B', dtype=dtype)
 10 | 
 11 |     k = tvm.reduce_axis((0, L), name='k')
 12 |     C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
 13 | ```
 14 | 
 15 | ```python
 16 | def placeholder(shape, dtype=None, name="placeholder"):
 17 |     shape = (shape,) if isinstance(shape, _expr.Expr) else shape
 18 |     dtype = float32 if dtype is None else dtype
 19 |     return _api_internal._Placeholder(
 20 |         shape, dtype, name)
 21 | ```
 22 | 
 23 | src/api/api_lang.cc
 24 | ```c++ 
 25 |  TVM_REGISTER_API("_Placeholder")
 26 | .set_body_typed<Tensor(Array<Expr>, Type, std::string)>([](
 27 |   Array<Expr> shape, Type dtype, std::string name
 28 | ) {
 29 |   return placeholder(shape, dtype, name);
 30 | });
 31 | ```
 32 | 
 33 | tvm.placeholder在C++的代码里返回的是Tensor对象，对应的，在python代码里，返回的也是Tensor对象，这部分的函数调用与数据结构的封装转换可以参考
 34 | "Python和C++之间函数调用的实现"这一章节。
 35 | 
 36 | 其中tvm.compute的代码在python/tvm/api.py里
 37 | ```python
 38 |  def compute(shape, fcompute, name="compute", tag="", attrs=None):
 39 |     """Construct a new tensor by computing over the shape domain.
 40 | 
 41 |     The compute rule is result[axis] = fcompute(axis)
 42 | 
 43 |     Parameters
 44 |     ----------
 45 |     shape: Tuple of Expr
 46 |         The shape of the tensor
 47 | 
 48 |     fcompute: lambda function of indices-> value
 49 |         Specifies the input source expression
 50 | 
 51 |     name: str, optional
 52 |         The name hint of the tensor
 53 | 
 54 |     tag: str, optional
 55 |         Additional tag information about the compute.
 56 | 
 57 |     attrs: dict, optional
 58 |         The additional auxiliary attributes about the compute.
 59 | 
 60 |     Returns
 61 |     -------
 62 |     tensor: Tensor
 63 |         The created tensor
 64 |     """
 65 |     if _tag.TagScope.get_current() is not None:
 66 |         if tag != "":
 67 |             raise ValueError("nested tag is not allowed for now")
 68 |         tag = _tag.TagScope.get_current().tag
 69 |     shape = (shape,) if isinstance(shape, _expr.Expr) else shape
 70 |     # for python3
 71 |     shape = tuple([int(s) if isinstance(s, float) else s for s in shape])
 72 |     ndim = len(shape)
 73 |     code = fcompute.__code__
 74 | 
 75 |     out_ndim = ndim
 76 |     if code.co_argcount == 0:
 77 |         arg_names = ["i%d" % i for i in range(ndim)]
 78 |     else:
 79 |         arg_names = code.co_varnames[:code.co_argcount]
 80 |         out_ndim = code.co_argcount
 81 | 
 82 |     if out_ndim != len(arg_names):
 83 |         raise ValueError("fcompute do not match dimension, ndim=%d" % ndim)
 84 | 
 85 |     dim_var = [_IterVar((0, s), x, 0) for x, s in zip(arg_names, shape[:out_ndim])]
 86 |     body = fcompute(*[v.var for v in dim_var])
 87 | 
 88 |     if isinstance(body, _tensor.TensorIntrinCall):
 89 |         for i, s in enumerate(shape[out_ndim:]):
 90 |             var_name = "ax" + str(i)
 91 |             dim_var.append(_IterVar((0, s), var_name, 4))
 92 |         op_node = _api_internal._TensorComputeOp(name,
 93 |                                                  tag,
 94 |                                                  dim_var,
 95 |                                                  body.reduce_axis,
 96 |                                                  out_ndim,
 97 |                                                  body.intrin,
 98 |                                                  body.tensors,
 99 |                                                  body.regions,
100 |                                                  body.scalar_inputs)
101 |     else:
102 |         if not isinstance(body, (list, tuple)):
103 |             body = [body]
104 |         body = convert(body)
105 |         op_node = _api_internal._ComputeOp(
106 |             name, tag, attrs, dim_var, body)
107 | 
108 |     num = op_node.num_outputs
109 |     outputs = tuple(op_node.output(i) for i in range(num))
110 |     return outputs[0] if num == 1 else outputs
111 |  ```
112 |  
113 |  代码中出现了一个类_IterVar，用来表示张量中某个维度的遍历器
114 |  
115 |  src/api/api_lang.cc
116 |  ```c++
117 |  TVM_REGISTER_API("_IterVar")
118 | .set_body_typed<IterVar(Range, Var, int, std::string)>([](
119 |   Range dom, Var var, int iter_type, std::string thread_tag
120 | ) {
121 |   return IterVarNode::make(
122 |       dom, var,
123 |       static_cast<IterVarType>(iter_type),
124 |       thread_tag);
125 | });
126 | ```
127 | 
128 | src/lang/expr.cc
129 | ```c++
130 | IterVar IterVarNode::make(Range dom,
131 |                           Var var,
132 |                           IterVarType t,
133 |                           std::string thread_tag) {
134 |   NodePtr<IterVarNode> n = make_node<IterVarNode>();
135 |   n->dom = dom;
136 |   n->var = var;
137 |   n->iter_type = t;
138 |   n->thread_tag = thread_tag;
139 |   return IterVar(n);
140 | }
141 | ```
142 | 那么现在我们看关键的一行：
143 |     body = fcompute(*[v.var for v in dim_var])
144 | 这里fcompute是python的lambda函数，
145 |  lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k)
146 | 
147 | 在执行的时候，首先要计算A[i,k]*B[k,j]。由于表达式是由运算符组成的，而参与运算的元素在TVM里都是ExprOp的实例，因此运算时会调用ExprOp中重载的运算符函数，比如两个Tensor元素相加会调用ExprOp.__add__()函数。
148 | python/tvm/expr.py
149 | ```python
150 | class ExprOp(object):
151 |     def __add__(self, other):
152 |         return _generic.add(self, other)
153 | 
154 |     def __radd__(self, other):
155 |         return self.__add__(other)
156 | 
157 |     def __sub__(self, other):
158 |         return _generic.subtract(self, other)
159 | 
160 |     def __rsub__(self, other):
161 |         return _generic.subtract(other, self)
162 | ```
163 | 
164 | 另外由于具体的表达式在C++中都有对应的表示，因此在实际计算时都会先调用C++的实现，例如_generic.add会调用_make._OpAdd(lhs, rhs)函数，进而会调用到C++中Add::make()，其实现在BinaryOpNode里。
165 | 
166 | include/tvm/ir.h
167 | ```c++
168 | template<typename T>
169 | class BinaryOpNode : public ExprNode {
170 |  public:
171 |   /*! \brief The left operand. */
172 |   Expr a;
173 |   /*! \brief The right operand. */
174 |   Expr b;
175 | 
176 |   void VisitAttrs(AttrVisitor* v) final {
177 |     v->Visit("dtype", &(this->type));
178 |     v->Visit("a", &a);
179 |     v->Visit("b", &b);
180 |   }
181 | 
182 |   static Expr make(Expr a, Expr b) {
183 |     CHECK(a.defined()) << "ValueError: a is undefined\n";
184 |     CHECK(b.defined()) << "ValueError: b is undefined\n";
185 |     CHECK(a.type() == b.type()) << "TypeError: mismatched types\n";
186 |     NodePtr<T> node = make_node<T>();
187 |     node->type = a.type();
188 |     node->a = std::move(a);
189 |     node->b = std::move(b);
190 |     return Expr(node);
191 |   }
192 | 
193 |   TVM_DECLARE_NODE_TYPE_INFO(T, ExprNode);
194 | };
195 | ```
196 | 
197 | 从make函数的实现上看，调用了make_node<T>()，生成一个新的node，并且把表达式的计算元素作为其子节点。
198 |     
199 | 通过这种方式，就可以建立起一个计算图，每个节点是一个Node，后续的优化就是基于这个计算图进行 
200 | 


--------------------------------------------------------------------------------
/source-analysis/tvm是如何生成cuda程序的.md:
--------------------------------------------------------------------------------
  1 | 
  2 | TVM使用relay作为IR表示，在生成程序的时候，调用的是tvm.relay.build函数，该函数调用最终被C++侧的RelayBuildModule处理：
  3 | 
  4 | src/relay/backend/build_module.cc
  5 | ```c++
  6 | class RelayBuildModule : public runtime::ModuleNode {
  7 |  public:
  8 |   /*!
  9 |    * \brief Get member function to front-end
 10 |    * \param name The name of the function.
 11 |    * \param sptr_to_self The pointer to the module node.
 12 |    * \return The corresponding member function.
 13 |    */
 14 |   PackedFunc GetFunction(const std::string& name,
 15 |                          const std::shared_ptr<ModuleNode>& sptr_to_self) final {
 16 |     if (name == "get_graph_json") {
 17 |       return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
 18 |         *rv = this->GetGraphJSON();
 19 |       });
 20 |     } else if (name == "get_module") {
 21 |       return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
 22 |         *rv = this->GetModule();
 23 |       });
 24 |     } else if (name == "build") {
 25 |       return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
 26 |         CHECK_EQ(args.num_args, 3);
 27 |         this->Build(args[0], args[1], args[2]);
 28 |       });
 29 |     } else if (name == "list_params") {
 30 |     ......
 31 | ```
 32 | Build函数调用了BuildRelay函数，在BuildRelay里，实现了Module的优化，然后做代码生成，最后调用tvm::build进行程序生成。
 33 | 
 34 | src/relay/backend/build_module.cc
 35 | ```c++
 36 | void BuildRelay(
 37 |       Function func,
 38 |       const std::unordered_map<std::string, tvm::runtime::NDArray>& params) {
 39 |     if (params.size()) {
 40 |       func = BindParamsByName(func, params);
 41 |     }
 42 | 
 43 |     // Perform Module->Module optimizations.
 44 |     relay::Module relay_module = relay::ModuleNode::FromExpr(func);
 45 |     relay_module = Optimize(relay_module, targets_, params);
 46 |     CHECK(relay_module.defined());
 47 |     // Get the updated function.
 48 |     func = relay_module->Lookup("main");
 49 | 
 50 |     // Generate code for the updated function.
 51 |     graph_codegen_ = std::unique_ptr<GraphCodegen>(new GraphCodegen());
 52 |     graph_codegen_->Init(nullptr, targets_);
 53 |     graph_codegen_->Codegen(func);
 54 | 
 55 |     ret_.graph_json = graph_codegen_->GetJSON();
 56 |     ret_.params = graph_codegen_->GetParams();
 57 | 
 58 |     auto lowered_funcs = graph_codegen_->GetLoweredFunc();
 59 |     if (lowered_funcs.size() != 0) {
 60 |       ret_.mod = tvm::build(
 61 |         lowered_funcs,
 62 |         target_host_,
 63 |         BuildConfig::Current());
 64 |     }
 65 |   }
 66 | ```
 67 | 
 68 | 在tvm::build函数里，根据不同的target, 分别调用DeviceBuild来生成程序
 69 |  
 70 | ```c++
 71 | // Build for heterogeneous execution.
 72 | runtime::Module build(const Map<Target, Array<LoweredFunc>>& inputs,
 73 |                       const Target& target_host,
 74 |                       const BuildConfig& config) {
 75 |   ......
 76 | 
 77 |   for (const auto& it : inputs) {
 78 |     auto host_dev_funcs =
 79 |         split_dev_host_funcs(it.second, it.first, target_host_val, config);
 80 |     auto& fhost = host_dev_funcs[0];
 81 |     auto& fdevice = host_dev_funcs[1];
 82 |     // Get the module for a certain target.
 83 |     runtime::Module mdev = DeviceBuild(fdevice, it.first);
 84 |     for (const auto& it : fhost) {
 85 |       fhost_all.push_back(it);
 86 |     }
 87 |     device_modules.push_back(mdev);
 88 |   }
 89 | 
 90 |   runtime::Module mhost = codegen::Build(fhost_all, target_host_val->str());
 91 |   ......
 92 | }
 93 | ```
 94 |  
 95 | 在DeviceBuild函数里，调用了codegen::Build来生成程序，在这个函数里，根据target查询对应的codegen.build_xxx函数，然后进行调用。对于cuda来说，调用的就是codegen.build_cuda()函数
 96 | 
 97 | src/codegen/codegen.cc
 98 | ```c++
 99 | runtime::Module Build(const Array<LoweredFunc>& funcs,
100 |                       const std::string& target) {
101 |   std::string mode = target;
102 |   size_t pos = mode.find(' ');
103 |   if (pos != std::string::npos) {
104 |     mode = mode.substr(0, pos);
105 |   }
106 |   std::string build_f_name = "codegen.build_" + mode;
107 |   // the build function.
108 |   const PackedFunc* bf = runtime::Registry::Get(build_f_name);
109 |   CHECK(bf != nullptr)
110 |       << "Target " << target << " is not enabled";
111 |   runtime::Module m = (*bf)(funcs, target);
112 |   return m;
113 | }
114 | ```
115 | codegen.build_cuda是一个被注册的函数名，其实现是codegen里的BuildCUDA函数，在其中又调用了注册的tvm_callback_cuda_compile函数
116 | 
117 | src/codegen/opt/build_cuda_on.cc
118 | ```c++
119 | runtime::Module BuildCUDA(Array<LoweredFunc> funcs) {
120 |   using tvm::runtime::Registry;
121 |   bool output_ssa = false;
122 |   CodeGenCUDA cg;
123 |   cg.Init(output_ssa);
124 | 
125 |   for (LoweredFunc f : funcs) {
126 |     cg.AddFunction(f);
127 |   }
128 |   std::string code = cg.Finish();
129 | 
130 |   if (const auto* f = Registry::Get("tvm_callback_cuda_postproc")) {
131 |     code = (*f)(code).operator std::string();
132 |   }
133 |   std::string fmt = "ptx";
134 |   std::string ptx;
135 |   if (const auto* f = Registry::Get("tvm_callback_cuda_compile")) {
136 |     ptx = (*f)(code).operator std::string();
137 |     // Dirty matching to check PTX vs cubin.
138 |     // TODO(tqchen) more reliable checks
139 |     if (ptx[0] != '/') fmt = "cubin";
140 |   } else {
141 |     ptx = NVRTCCompile(code, cg.need_include_path());
142 |   }
143 |   return CUDAModuleCreate(ptx, fmt, ExtractFuncInfo(funcs), code);
144 | }
145 | 
146 | TVM_REGISTER_API("codegen.build_cuda")
147 | .set_body_typed(BuildCUDA);
148 | ```
149 | 
150 | tvm_callback_cuda_compile函数是从python里注册的。
151 | 
152 | python/tvm/autotvm/measure/measure_methods.py
153 | ```python
154 | @register_func
155 | def tvm_callback_cuda_compile(code):
156 |     """use nvcc to generate ptx code for better optimization"""
157 |     ptx = nvcc.compile_cuda(code, target="ptx", arch=AutotvmGlobalScope.current.cuda_target_arch)
158 |     return ptx
159 | ```
160 | 
161 | 而在nvcc.compile_cuda函数里，使用了命令行调用nvcc来生成程序
162 | ```python
163 | def compile_cuda(code,
164 |                  target="ptx",
165 |                  arch=None,
166 |                  options=None,
167 |                  path_target=None):
168 |     """Compile cuda code with NVCC from env.
169 | 
170 |     Parameters
171 |     ----------
172 |     code : str
173 |         The cuda code.
174 | 
175 |     target : str
176 |         The target format
177 | 
178 |     arch : str
179 |         The architecture
180 | 
181 |     options : str or list of str
182 |         The additional options
183 | 
184 |     path_target : str, optional
185 |         Output file.
186 | 
187 |     Return
188 |     ------
189 |     cubin : bytearray
190 |         The bytearray of the cubin
191 |     """
192 |     temp = util.tempdir()
193 |     if target not in ["cubin", "ptx", "fatbin"]:
194 |         raise ValueError("target must be in cubin, ptx, fatbin")
195 |     temp_code = temp.relpath("my_kernel.cu")
196 |     temp_target = temp.relpath("my_kernel.%s" % target)
197 | 
198 |     with open(temp_code, "w") as out_file:
199 |         out_file.write(code)
200 | 
201 |     if arch is None:
202 |         if nd.gpu(0).exist:
203 |             # auto detect the compute arch argument
204 |             arch = "sm_" + "".join(nd.gpu(0).compute_version.split('.'))
205 |         else:
206 |             raise ValueError("arch(sm_xy) is not passed, and we cannot detect it from env")
207 | 
208 |     file_target = path_target if path_target else temp_target
209 |     cmd = ["nvcc"]
210 |     cmd += ["--%s" % target, "-O3"]
211 |     cmd += ["-arch", arch]
212 | 
213 |     if options:
214 |         if isinstance(options, str):
215 |             cmd += [options]
216 |         elif isinstance(options, list):
217 |             cmd += options
218 |         else:
219 |             raise ValueError("options must be str or list of str")
220 | 
221 |     cmd += ["-o", file_target]
222 |     cmd += [temp_code]
223 | 
224 |     proc = subprocess.Popen(
225 |         cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
226 | 
227 |     (out, _) = proc.communicate()
228 | 
229 |     if proc.returncode != 0:
230 |         msg = "Compilation error:\n"
231 |         msg += py_str(out)
232 |         raise RuntimeError(msg)
233 | 
234 |     data = bytearray(open(file_target, "rb").read())
235 |     if not data:
236 |         raise RuntimeError(
237 |             "Compilation error: empty result is generated")
238 |     return data
239 | ```
240 | 


--------------------------------------------------------------------------------
/mytophub/20191227/vgg16_bs_8.log:
--------------------------------------------------------------------------------
 1 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 3, 224, 224, "float32"], [64, 3, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 123169609, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 8]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 14, 16, 1]], ["tile_rc", "sp", [-1, 3]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}], "r": [[0.00043884453801169593, 0.0004389303333333333, 0.0004389471783625731, 0.00043897208479532163, 0.00043899851461988307, 0.0004390054532163743, 0.0004390314356725146, 0.0004391633771929825], 0, 31.968744039535522, 1576756800.1847723], "v": 0.1}
 2 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 64, 224, 224], "float32"], ["TENSOR", [64, 64, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 64, 224, 224, "float32"], [64, 64, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 6849591, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 4]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 4]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0051756865, 0.005176650233333333, 0.005176739033333333, 0.005176748266666667, 0.0051773150333333335, 0.0051774181, 0.005178210966666667, 0.005178377299999999], 0, 13.34377384185791, 1576760985.4057124], "v": 0.1}
 3 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 64, 112, 112], "float32"], ["TENSOR", [128, 64, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 64, 112, 112, "float32"], [128, 64, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 5485309, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 16]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00201113948, 0.0020111664, 0.0020113777866666667, 0.0020113957466666665, 0.0020118043733333337, 0.0020118242666666668, 0.00201183216, 0.0020119977066666668], 0, 27.35636830329895, 1576765599.6230526], "v": 0.1}
 4 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 128, 112, 112], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 128, 112, 112, "float32"], [128, 128, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 2749281, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 4, 8, 4]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.0026774523859649122, 0.002678032736842105, 0.002678074543859649, 0.0026782198947368423, 0.0026784177543859645, 0.0026787226315789473, 0.002678738631578947, 0.0026790771578947365], 0, 25.030742168426514, 1576769260.3778613], "v": 0.1}
 5 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 128, 56, 56], "float32"], ["TENSOR", [256, 128, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 128, 56, 56, "float32"], [256, 128, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 3262689, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 16]], ["tile_x", "sp", [-1, 2, 8, 2]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 1]]}], "r": [[0.0011918108015873015, 0.0011918624761904763, 0.0011919034126984126, 0.0011919254523809523, 0.0011919315476190478, 0.0011919345952380953, 0.0011920441587301588, 0.0011921915317460318], 0, 32.866421699523926, 1576775616.8286216], "v": 0.1}
 6 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 256, 56, 56], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 256, 56, 56, "float32"], [256, 256, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 3632289, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 16]], ["tile_x", "sp", [-1, 2, 8, 2]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 1]]}], "r": [[0.0017913505952380953, 0.0017916381785714286, 0.001791678892857143, 0.0017921378095238096, 0.0017922411904761903, 0.0017922746785714286, 0.001792527857142857, 0.0017926656785714288], 0, 48.27368664741516, 1576781325.8835814], "v": 0.1}
 7 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 256, 28, 28], "float32"], ["TENSOR", [512, 256, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 256, 28, 28, "float32"], [512, 256, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 531014, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 8]], ["tile_x", "sp", [-1, 2, 8, 2]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}], "r": [[0.001271321462184874, 0.0012713655210084033, 0.0012714528235294117, 0.0012714854957983194, 0.001271555848739496, 0.0012715957899159666, 0.001271636025210084, 0.0012725462689075632], 0, 6.772793531417847, 1576785302.1855698], "v": 0.1}
 8 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 512, 28, 28], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 512, 28, 28, "float32"], [512, 512, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 1763018, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 8]], ["tile_x", "sp", [-1, 2, 8, 2]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 0]]}], "r": [[0.0021595973428571425, 0.002159768957142857, 0.0021598224285714284, 0.0021598359428571428, 0.0021598460142857145, 0.0021604248714285714, 0.002161826742857143, 0.002163405814285714], 0, 6.428175449371338, 1576789309.7965672], "v": 0.1}
 9 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 512, 14, 14], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 512, 14, 14, "float32"], [512, 512, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 2340042, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 8, 4]], ["tile_x", "sp", [-1, 7, 8, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0005950515256916996, 0.0005951132292490118, 0.0005951994664031621, 0.0005952432411067194, 0.000595274442687747, 0.0005952786007905139, 0.0005952991739130435, 0.00059571485770751], 0, 19.57026195526123, 1576792761.9634597], "v": 0.1}
10 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [8, 25088], "float32"], ["TENSOR", [4096, 25088], "float32"], null, "float32"], {}, ["dense", [8, 25088, "float32"], [4096, 25088, "float32"], 0, "float32"], {"i": 20, "t": "direct", "c": null, "e": [["tile_k", "sp", [-1, 512]]]}], "r": [[0.013735978133333333, 0.013737530266666666, 0.0137389568, 0.013739324, 0.013739936866666667, 0.0137405719, 0.013741592666666667, 0.013742133600000001], 0, 5.85648512840271, 1576796347.5640316], "v": 0.1}
11 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [8, 4096], "float32"], ["TENSOR", [4096, 4096], "float32"], null, "float32"], {}, ["dense", [8, 4096, "float32"], [4096, 4096, "float32"], 0, "float32"], {"i": 10, "t": "direct", "c": null, "e": [["tile_k", "sp", [-1, 1024]]]}], "r": [[0.0020749986315789473, 0.002075076184210526, 0.0020751295175438595, 0.0020752010263157894, 0.002075247114035088, 0.002075259842105263, 0.0020753068859649124, 0.002075465684210526], 0, 3.755605697631836, 1576796424.9416084], "v": 0.1}
12 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [8, 4096], "float32"], ["TENSOR", [1000, 4096], "float32"], null, "float32"], {}, ["dense", [8, 4096, "float32"], [1000, 4096, "float32"], 0, "float32"], {"i": 10, "t": "direct", "c": null, "e": [["tile_k", "sp", [-1, 1024]]]}], "r": [[0.0005117982194092827, 0.0005118024556962026, 0.000511832552742616, 0.0005119014367088607, 0.000511920835443038, 0.0005119257658227848, 0.0005119599385665529, 0.0005120105632911392], 0, 3.3364245891571045, 1576796444.9151719], "v": 0.1}
13 | 


--------------------------------------------------------------------------------
/mytophub/20191227/vgg16_bs_128.log:
--------------------------------------------------------------------------------
 1 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 3, 224, 224, "float32"], [64, 3, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 68546767, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 4, 16]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 4, 56, 1]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.0067756335, 0.006775662866666667, 0.0067762048666666665, 0.006777074466666667, 0.0067775465666666665, 0.006777862433333334, 0.006778341166666666, 0.006779208033333334], 0, 8.05925464630127, 1577084106.3366344], "v": 0.1}
 2 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 64, 224, 224], "float32"], ["TENSOR", [64, 64, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 64, 224, 224, "float32"], [64, 64, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 12247764, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 4, 4, 4]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}], "r": [[0.0806969979, 0.08069946880000001, 0.08070511206666667, 0.08070760733333333, 0.0807098196, 0.08071011046666668, 0.08071752313333333, 0.08071768186666667], 0, 66.33862924575806, 1577092677.258777], "v": 0.1}
 3 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 64, 112, 112], "float32"], ["TENSOR", [128, 64, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 64, 112, 112, "float32"], [128, 64, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 14435333, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 16, 4, 2]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 1]]}], "r": [[0.032299483766666665, 0.03230264333333333, 0.03231041233333334, 0.032312964933333334, 0.0323159521, 0.03231791446666667, 0.03232048813333333, 0.0323223204], 0, 63.83933424949646, 1577099581.9539115], "v": 0.1}
 4 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 128, 112, 112], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 128, 112, 112, "float32"], [128, 128, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 15398697, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 8, 8, 2]], ["tile_x", "sp", [-1, 2, 4, 4]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 1]]}], "r": [[0.0486389951, 0.048642012166666665, 0.048642853933333334, 0.048643071366666665, 0.0486454236, 0.04864591186666667, 0.04865160723333333, 0.04865232966666667], 0, 119.94365000724792, 1577112890.8096297], "v": 0.1}
 5 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 128, 56, 56], "float32"], ["TENSOR", [256, 128, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 128, 56, 56, "float32"], [256, 128, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 13454535, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 4, 16, 4]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 1]]}], "r": [[0.018519757600000002, 0.018520261033333332, 0.018522302966666668, 0.01852448563333333, 0.018527181533333333, 0.018528770966666667, 0.018530687866666666, 0.018535437], 0, 46.35883831977844, 1577126759.9553223], "v": 0.1}
 6 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 256, 56, 56], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 256, 56, 56, "float32"], [256, 256, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 17447555, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 8, 8]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.027734048233333333, 0.027735653533333335, 0.0277368192, 0.027737061266666665, 0.027737964966666664, 0.027741680300000002, 0.027743143966666665, 0.027746712266666664], 0, 115.22682452201843, 1577141152.0336742], "v": 0.1}
 7 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 256, 28, 28], "float32"], ["TENSOR", [512, 256, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 256, 28, 28, "float32"], [512, 256, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 24231422, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 16]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.019290107100000002, 0.019290146866666667, 0.019290160100000002, 0.019290439766666667, 0.019290458833333336, 0.01929097546666667, 0.0192913101, 0.019291328], 0, 21.033047199249268, 1577150648.347193], "v": 0.1}
 8 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 512, 28, 28], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 512, 28, 28, "float32"], [512, 512, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 25683395, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 8, 8]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.033039312566666666, 0.0330393956, 0.0330400433, 0.033040511433333336, 0.0330418653, 0.033042196466666666, 0.03304370053333334, 0.03304389433333334], 0, 68.99351716041565, 1577161518.120259], "v": 0.1}
 9 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 512, 14, 14], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 512, 14, 14, "float32"], [512, 512, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 6624328, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 4, 16, 4]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.0084244303, 0.008424947666666667, 0.008425014966666667, 0.008425178866666666, 0.0084256713, 0.008425792900000001, 0.0084277559, 0.008427820499999999], 0, 21.29027485847473, 1577172345.6539006], "v": 0.1}
10 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [128, 25088], "float32"], ["TENSOR", [4096, 25088], "float32"], null, "float32"], {}, ["dense", [128, 25088, "float32"], [4096, 25088, "float32"], 0, "float32"], {"i": 516298, "t": "direct", "c": null, "e": [["tile_x", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 1, 16, 4]], ["tile_k", "sp", [-1, 49, 1]]]}], "r": [[0.0100990975, 0.010099897166666667, 0.0101001379, 0.0101002811, 0.010100448666666668, 0.0101010431, 0.010101399533333333, 0.0101016677], 0, 58.480523109436035, 1577177656.6776683], "v": 0.1}
11 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [128, 4096], "float32"], ["TENSOR", [4096, 4096], "float32"], null, "float32"], {}, ["dense", [128, 4096, "float32"], [4096, 4096, "float32"], 0, "float32"], {"i": 1981828, "t": "direct", "c": null, "e": [["tile_x", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 4, 16, 2]], ["tile_k", "sp", [-1, 1, 8]]]}], "r": [[0.0023489219555555556, 0.002352316322222222, 0.0023525885444444445, 0.0023527369333333336, 0.002353442922222222, 0.002353896488888889, 0.0023555189444444445, 0.0023664568], 0, 3.5413146018981934, 1577181748.0170348], "v": 0.1}
12 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [128, 4096], "float32"], ["TENSOR", [1000, 4096], "float32"], null, "float32"], {}, ["dense", [128, 4096, "float32"], [1000, 4096, "float32"], 0, "float32"], {"i": 149902, "t": "direct", "c": null, "e": [["tile_x", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 5, 8, 1]], ["tile_k", "sp", [-1, 8, 1]]]}], "r": [[0.0006632385418502203, 0.0006632589207048458, 0.00066326459030837, 0.0006636647444933921, 0.0006639716167400881, 0.0006640934713656387, 0.0006641958414096916, 0.0006642457268722467], 0, 9.101941585540771, 1577183463.872119], "v": 0.1}
13 | 


--------------------------------------------------------------------------------
/mytophub/20191227/vgg16_bs_1.log:
--------------------------------------------------------------------------------
 1 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 3, 224, 224, "float32"], [64, 3, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 110545142, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 1, 4]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}], "r": [[6.0289909044844186e-05, 6.030973853559665e-05, 6.033552039523689e-05, 6.0351307068659744e-05, 6.035974740309095e-05, 6.0368374714973394e-05, 6.038294476817837e-05, 6.046449657968077e-05], 0, 6.081462383270264, 1576639639.1194658], "v": 0.1}
 2 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 64, 224, 224], "float32"], ["TENSOR", [64, 64, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 64, 224, 224, "float32"], [64, 64, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 219649, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 8]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}], "r": [[0.0006486326379310344, 0.0006487552025862069, 0.0006487734396551725, 0.0006488455, 0.0006488479181034483, 0.0006488770474137931, 0.0006489530775862068, 0.0006489756034482758], 0, 4.759263515472412, 1576641806.1470792], "v": 0.1}
 3 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 64, 112, 112], "float32"], ["TENSOR", [128, 64, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 64, 112, 112, "float32"], [128, 64, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 427400, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 8, 4]], ["tile_x", "sp", [-1, 7, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 0]]}], "r": [[0.00026877778747203576, 0.0002687925212527964, 0.000268806870246085, 0.00026880836241610737, 0.0002688106029082774, 0.0002689192125279642, 0.000268941615212528, 0.00026932233668903804], 0, 6.267563581466675, 1576644767.460554], "v": 0.1}
 4 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 128, 112, 112], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 128, 112, 112, "float32"], [128, 128, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 1141417, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 8]], ["tile_x", "sp", [-1, 7, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}], "r": [[0.0003633437602905569, 0.0003634449031476998, 0.0003634681016949152, 0.000363498598062954, 0.00036350954479418885, 0.00036362252300242134, 0.0003636384043583535, 0.0003637305544794189], 0, 7.891664743423462, 1576647452.214853], "v": 0.1}
 5 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [256, 128, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 128, 56, 56, "float32"], [256, 128, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 318380, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 4, 4]], ["tile_x", "sp", [-1, 7, 28, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.0001731678460987831, 0.00017317816320687186, 0.00017323992197566213, 0.00017326432641374373, 0.00017327575232641373, 0.00017328797136721547, 0.00017329106800286327, 0.0001732960365068003], 0, 21.01165533065796, 1576650634.9818578], "v": 0.1}
 6 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 256, 56, 56, "float32"], [256, 256, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 351380, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 4, 4]], ["tile_x", "sp", [-1, 7, 28, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.00026320864526659414, 0.0002632155527747552, 0.00026323465070729056, 0.00026325022959738844, 0.00026325169967355823, 0.0002633182208922742, 0.0002633227769314472, 0.0002633241109902068], 0, 34.0358304977417, 1576653622.5075867], "v": 0.1}
 7 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [512, 256, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 256, 28, 28, "float32"], [512, 256, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 468529, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 8]], ["tile_x", "sp", [-1, 7, 28, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.00018472845107033638, 0.00018478204204892967, 0.0001847839373088685, 0.00018481514678899083, 0.0001848305359327217, 0.00018489081651376147, 0.00018489247171253823, 0.00018495724082568806], 0, 9.295566320419312, 1576656137.8889992], "v": 0.1}
 8 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 512, 28, 28, "float32"], [512, 512, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 1172529, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 8]], ["tile_x", "sp", [-1, 7, 28, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0003213928288288288, 0.00032142876126126124, 0.00032144187087087086, 0.00032147865615615614, 0.0003215641951951952, 0.0003216147882882883, 0.0003216725465465465, 0.00032167922822822826], 0, 4.207469701766968, 1576658581.2087324], "v": 0.1}
 9 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 512, 14, 14, "float32"], [512, 512, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 31806, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 4]], ["tile_x", "sp", [-1, 7, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 0]]}], "r": [[0.00012261084089750128, 0.00012261468179500256, 0.0001226194385517593, 0.0001226484033656298, 0.00012265550178480367, 0.00012267854207037224, 0.00012268475063742987, 0.00012270045283018868], 0, 8.118497133255005, 1576660636.3987286], "v": 0.1}
10 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [1, 25088], "float32"], ["TENSOR", [4096, 25088], "float32"], null, "float32"], {}, ["dense", [1, 25088, "float32"], [4096, 25088, "float32"], 0, "float32"], {"i": 20, "t": "direct", "c": null, "e": [["tile_k", "sp", [-1, 512]]]}], "r": [[0.0017204462500000001, 0.0017206680227272728, 0.0017207383863636363, 0.0017208100568181818, 0.0017209053977272728, 0.0017209912159090908, 0.0017212201136363634, 0.001721318909090909], 0, 3.1095077991485596, 1576661831.3531494], "v": 0.1}
11 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [1, 4096], "float32"], ["TENSOR", [4096, 4096], "float32"], null, "float32"], {}, ["dense", [1, 4096, "float32"], [4096, 4096, "float32"], 0, "float32"], {"i": 10, "t": "direct", "c": null, "e": [["tile_k", "sp", [-1, 1024]]]}], "r": [[0.0002654189538807649, 0.0002654267705286839, 0.0002654766062992126, 0.0002654796209223847, 0.00026549371766029247, 0.00026550077615298086, 0.00026550680877390325, 0.00027823597975253094], 0, 4.076004505157471, 1576661847.877501], "v": 0.1}
12 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [1, 4096], "float32"], ["TENSOR", [1000, 4096], "float32"], null, "float32"], {}, ["dense", [1, 4096, "float32"], [1000, 4096, "float32"], 0, "float32"], {"i": 8, "t": "direct", "c": null, "e": [["tile_k", "sp", [-1, 256]]]}], "r": [[6.899598885395827e-05, 6.899706201771935e-05, 6.902277507859388e-05, 6.903307116318948e-05, 6.903308116604745e-05, 6.905720777364962e-05, 6.906179079737068e-05, 6.906601571877679e-05], 0, 4.010441780090332, 1576661867.8294497], "v": 0.1}
13 | 


--------------------------------------------------------------------------------
/source-analysis/最简单的Tuner分析.md:
--------------------------------------------------------------------------------
  1 | 
  2 | 在TVM官网上有一个简单的自动Tuner的例子，其中关于schedule的部分代码如下：
  3 | 
  4 | ```python
  5 | # Matmul V1: List candidate values
  6 | @autotvm.template  # 1. use a decorator
  7 | def matmul_v1(N, L, M, dtype):
  8 |     A = tvm.placeholder((N, L), name='A', dtype=dtype)
  9 |     B = tvm.placeholder((L, M), name='B', dtype=dtype)
 10 | 
 11 |     k = tvm.reduce_axis((0, L), name='k')
 12 |     C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
 13 |     s = tvm.create_schedule(C.op)
 14 | 
 15 |     # schedule
 16 |     y, x = s[C].op.axis
 17 |     k = s[C].op.reduce_axis[0]
 18 | 
 19 |     # 2. get the config object
 20 |     cfg = autotvm.get_config()
 21 | 
 22 |     # 3. define search space
 23 |     cfg.define_knob("tile_y", [1, 2, 4, 8, 16])
 24 |     cfg.define_knob("tile_x", [1, 2, 4, 8, 16])
 25 | 
 26 |     # 4. schedule according to config
 27 |     yo, yi = s[C].split(y, cfg['tile_y'].val)
 28 |     xo, xi = s[C].split(x, cfg['tile_x'].val)
 29 | 
 30 |     s[C].reorder(yo, xo, k, yi, xi)
 31 | 
 32 |     return s, [A, B, C]
 33 | ``` 
 34 | 
 35 | 这里比较重要的概念是schedule、config、knob，下面逐一从代码层面分析。
 36 | 
 37 | create_schedule调用了C++实现的函数create_schedule
 38 | 
 39 | python/tvm/schedule.py
 40 | ```python
 41 | def create_schedule(ops):
 42 |     if not isinstance(ops, (list, _container.Array)):
 43 |         ops = [ops]
 44 |     return _api_internal._CreateSchedule(ops)
 45 | ```
 46 | 
 47 | src/api/api_lang.cc
 48 | ```c++
 49 | TVM_REGISTER_API("_CreateSchedule")
 50 | .set_body_typed(create_schedule);
 51 | ```
 52 | 
 53 | include/tvm/schedule.h
 54 | ```c++
 55 | inline Schedule create_schedule(Array<Operation> ops) {
 56 |   return ScheduleNode::make(ops);
 57 | }
 58 | ```
 59 | 
 60 | ScheduleNode的继承关系是：ScheduleNode->Node->NodeBase，让人稍感麻烦的是这几个类定义在不同的地方，分别是
 61 | include/tvm/schedule.h include/tvm/node/node.h  include/tvm/runtime/node_base.h
 62 | 
 63 | ScheduleNode::make(Array<Operation> ops)定义在这里：
 64 |     
 65 | src/schedule/schedule_lang.cc
 66 | ```c++
 67 | Schedule ScheduleNode::make(Array<Operation> ops) {
 68 |   auto n = make_node<ScheduleNode>();
 69 |   Schedule sch(n);
 70 |   n->outputs = ops;
 71 |   auto g = schedule::CreateReadGraph(n->outputs);
 72 |   Array<Operation> post_order = schedule::PostDFSOrder(n->outputs, g);
 73 |   // output set.
 74 |   std::unordered_set<Operation> output_set;
 75 |   for (Operation x : ops) {
 76 |     output_set.insert(x);
 77 |   }
 78 |   for (Operation op : post_order) {
 79 |     Stage stage(op);
 80 |     stage->is_output = output_set.count(op) != 0;
 81 |     n->stages.push_back(stage);
 82 |     n->stage_map.Set(op, stage);
 83 |     // mark scan updates.
 84 |     if (const ScanOpNode* scan = op.as<ScanOpNode>()) {
 85 |       Array<Tensor> inputs;
 86 |       for (Tensor t : scan->state_placeholder) {
 87 |         inputs.push_back(t);
 88 |       }
 89 |       for (Tensor t : scan->inputs) {
 90 |         inputs.push_back(t);
 91 |       }
 92 |       // Create the scan group.
 93 |       Stage scan_group = sch.create_group(scan->update, inputs, false);
 94 |       scan_group->attach_type = kScanUpdate;
 95 |       scan_group->attach_stage = stage;
 96 | 
 97 |       for (size_t i = 0; i < scan->update.size(); ++i) {
 98 |         Stage s = n->stage_map[scan->update[i]->op];
 99 |         CHECK(scan_group.same_as(s->group));
100 |       }
101 |     }
102 |   }
103 |   return sch;
104 | }
105 | ```
106 | 从这段代码可以看出，返回的是一个Schedule对象，继承关系为 Schedule->NodeRef
107 | Schedule控制所有的优化过程，优化包含很多步骤，每个步骤称为一个Stage（也是继承于NodeRef），Stage也是定义在include/tvm/schedule.h里的。在Stage里实现了很多优化的方法，比如split、tile、fuse、reorder、bind、compute_at、compute_inline、vectorize、tensorize、prefetch等。关于这些优化方法的介绍，最好的方式是参考TVM的文档和源码，后面有机会我也会做一些分析介绍。
108 | 
109 | 一些参考资料：
110 | * https://docs.tvm.ai/tutorials/language/schedule_primitives.html
111 | * https://blog.csdn.net/sayandroid/article/details/88784933#2_stage_50 （上一篇的中文版）
112 | * https://docs.tvm.ai/tutorials/language/tensorize.html
113 | 
114 | 
115 | 下一步是获取config，get_config()定义在这里
116 | 
117 | python/tvm/autotvm/task/task.py
118 | ```python
119 | def get_config():
120 |     """Get current config object
121 |     Returns
122 |     -------
123 |     cfg: ConfigSpace or ConfigEntity
124 |         The current config
125 |     """
126 |     return DispatchContext.current.query(None, None)
127 | ```
128 | 其中query的函数签名为 `def query(self, target, workload)`，这里传入的target和workload都是None。
129 | 
130 | 缺省的DispatchContext是FallbackContext，在其内部维护了一个hashmap，用来保存ConfigSpace。这个例子里获取的是缺省的ConfigSpace，实现类为FallbackConfigEntity。缺省的DispatchContext是在下面的文件里设置的：
131 | python/tvm/autotvm/task/dispatcher.py
132 | ```python
133 | DispatchContext.current = FallbackContext()
134 | ```
135 | 
136 | 接下来是调用了ConfigSpace.define_knob，也就是在Config里增加了一个transform操作。TransformSpace有多个子类，包括VirtualAxis，SplitSpace，ReorderSpace，AnnotateSpace，OtherOptionSpace
137 | ```python
138 |     def define_knob(self, name, candidate):
139 |         """Define a tunable knob with a list of candidates
140 | 
141 |         Parameters
142 |         ----------
143 |         name: str
144 |             name key of that option
145 |         candidate: list
146 |             list of candidates
147 |         """
148 |         return self._add_new_transform(OtherOptionSpace, name, [], None, candidate=candidate)
149 | ```
150 | ```python
151 |     def _add_new_transform(self, space_class, name, axes, policy, **kwargs):
152 |         """Add a new transform space in template"""
153 |         if self._collect:
154 |             # convert schedule axis to space definition axis
155 |             axes = [x if isinstance(x, (VirtualAxis, Axis)) else self.axis(x) for x in axes]
156 | 
157 |             # add subspace (knob)
158 |             space = space_class(axes, policy, **kwargs)
159 |             self.space_map[name] = space
160 |             self._entity_map[name] = space[0]
161 |             return [Axis(space, i) for i in range(space.num_output)]
162 |         return [Axis(None, i) for i in range(space_class.get_num_output(axes, policy, **kwargs))]
163 | ```
164 | 
165 | 最后我们看一下split是如何工作的
166 | 
167 | src/schedule/schedule_lang.cc
168 | ```c++
169 | Stage& Stage::split(
170 |     IterVar parent, Expr factor, IterVar* p_outer, IterVar* p_inner) {  // NOLINT(*)
171 |   Split(operator->(), parent, factor, Expr(), p_outer, p_inner);
172 |   return *this;
173 | }
174 | 
175 | void Split(StageNode* self,
176 |            IterVar parent,
177 |            Expr factor,
178 |            Expr nparts,
179 |            IterVar* p_outer,
180 |            IterVar* p_inner) {
181 |   // Check if split is valid.
182 |   CHECK(parent->iter_type == kDataPar ||
183 |         parent->iter_type == kCommReduce ||
184 |         parent->iter_type == kOrdered)
185 |       << "Cannot split on " << IterVarType2String(parent->iter_type);
186 |   IterVar outer = IterVarNode::make(
187 |       Range(), parent->var.copy_with_suffix(".outer"), parent->iter_type);
188 |   IterVar inner = IterVarNode::make(
189 |       Range(), parent->var.copy_with_suffix(".inner"), parent->iter_type);
190 |   *p_outer = outer;
191 |   *p_inner = inner;
192 |   // The splits
193 |   ArrayNode* all_vars = self->all_iter_vars.CopyOnWrite();
194 |   ArrayNode* leaf_vars = self->leaf_iter_vars.CopyOnWrite();
195 |   size_t pos = FindLeafVar(all_vars, leaf_vars, parent);
196 |   self->relations.push_back(SplitNode::make(parent, outer, inner, factor, nparts));
197 |   // add vars to all vars
198 |   all_vars->data.push_back(outer.node_);
199 |   all_vars->data.push_back(inner.node_);
200 |   // replace the position.
201 |   leaf_vars->data.erase(leaf_vars->data.begin() + pos);
202 |   leaf_vars->data.insert(leaf_vars->data.begin() + pos, inner.node_);
203 |   leaf_vars->data.insert(leaf_vars->data.begin() + pos, outer.node_);
204 | }
205 | ```
206 | 
207 | 这里StageNode有几个关键的成员：relations, all_iter_vars, leaf_iter_vars，具体解释在StageNode的类定义里：
208 | 
209 | ```c++
210 | /*!
211 |  * \brief represents a stage.
212 |  *
213 |  *  relations form a Directed acylic hypergraph in bipartite manner.
214 |  *  With each node is represented by a IterVar,
215 |  *  and each hyper-edge is represented by a IterVarRelation.
216 |  *  The relations connects the IterVars in the graph.
217 |  *
218 |  *  Besides typical stage that corresponds to operations.
219 |  *  There is also group stage, which groups stages together.
220 |  *  Each stage's group(given by group) represent an constraint,
221 |  *  the stage can only be attached to stages within the group.
222 |  *
223 |  *  The group stage node can be attached to IterVars as in normal stage.
224 |  */
225 | class StageNode : public Node {
226 |  public:
227 |  
228 |   /*! \brief All the nodes in the iter var */
229 |   Array<IterVar> all_iter_vars;
230 |   /*! \brief The current active leaf iter vars in the stage. */
231 |   Array<IterVar> leaf_iter_vars;
232 | 
233 |   Array<IterVarRelation> relations;
234 |   /*! \brief additional attributes about iter var. */
235 |   Map<IterVar, IterVarAttr> iter_var_attrs;
236 |   
237 |   ...
238 | }
239 | ```
240 | IterVar代表的是有向无环超图（DAG）中的节点，IterVarRelation代表的是连接这些节点的超边（hyper-edge）。
241 | 
242 | 未完待续
243 | 


--------------------------------------------------------------------------------
/source-analysis/如何利用已生成的高效代码.md:
--------------------------------------------------------------------------------
  1 | 
  2 | 在调用relay.build()来生成代码时，有一种可能是已经有很多Op之前曾经见过并且做过比较好的优化，那很自然的我们会希望用到这些优化的代码。
  3 | 
  4 | python/tvm/relay/build_module.py
  5 | ```python
  6 | def build(mod, target=None, target_host=None, params=None):
  7 |     ......
  8 | 
  9 |     # If current dispatch context is fallback context (the default root context),
 10 |     # then load pre-tuned parameters from TopHub
 11 |     if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
 12 |         tophub_context = autotvm.tophub.context(list(target.values()))
 13 |     else:
 14 |         tophub_context = autotvm.util.EmptyContext()
 15 | 
 16 |     with tophub_context:
 17 |         bld_mod = BuildModule()
 18 |         graph_json, mod, params = bld_mod.build(func, target, target_host, params)
 19 |     return graph_json, mod, params
 20 | ```
 21 | 
 22 | 在上面的relay.build的实现里，使用了tophub_context，通过查看其实现，可以知道生成的是ApplyHistoryBest([])。为什么叫这个名字呢？因为它的原理就是
 23 | 从之前已经优化过的配置中寻找用于当前操作的代码。
 24 | 已经优化过的配置是从github上下载的，大家可以访问 https://github.com/uwsampl/tvm-distro/tree/master/tophub ，里面包含了很多已经优化过的配置。
 25 | 
 26 | python/tvm/autotvm/tophub.py
 27 | ```python
 28 | def context(target, extra_files=None):
 29 |     """Return the dispatch context with pre-tuned parameters.
 30 |     This function will load the corresponding *.log files in AUTOTVM_TOPHUB_ROOT_PATH.
 31 |     If cannot find them, it will download them from TopHub github repo.
 32 |     Users can also add their own files in argument `extra_files`.
 33 | 
 34 |     Parameters
 35 |     ----------
 36 |     target: Target or List of Target
 37 |         The compilation target
 38 |     extra_files: list of str, optional
 39 |         Extra log files to load
 40 |     """
 41 |     best_context = ApplyHistoryBest([])
 42 | 
 43 |     targets = target if isinstance(target, (list, tuple)) else [target]
 44 | 
 45 |     for tgt in targets:
 46 |         if isinstance(tgt, str):
 47 |             tgt = _target.create(tgt)
 48 | 
 49 |         possible_names = []
 50 |         for opt in tgt.options:
 51 |             if opt.startswith("-device"):
 52 |                 device = _alias(opt[8:])
 53 |                 possible_names.append(device)
 54 |         possible_names.append(tgt.target_name)
 55 | 
 56 |         all_packages = list(PACKAGE_VERSION.keys())
 57 |         for name in possible_names:
 58 |             name = _alias(name)
 59 |             if name in all_packages:
 60 |                 if not check_backend(name):
 61 |                     continue
 62 | 
 63 |                 filename = "%s_%s.log" % (name, PACKAGE_VERSION[name])
 64 |                 best_context.load(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, filename))
 65 |                 break   # only load one file to avoid some fallback template mismatch problem
 66 | 
 67 |     if extra_files:
 68 |         for filename in extra_files:
 69 |             best_context.load(filename)
 70 | 
 71 |     return best_context
 72 |  
 73 |  def check_backend(backend):
 74 |     """Check whether have pre-tuned parameters of the certain target.
 75 |     If not, will download it.
 76 | 
 77 |     Parameters
 78 |     ----------
 79 |     backend: str
 80 |         The name of backend.
 81 | 
 82 |     Returns
 83 |     ----------
 84 |     success: bool
 85 |         Whether the check is successful.
 86 |     """
 87 |     backend = _alias(backend)
 88 |     assert backend in PACKAGE_VERSION, 'Cannot find backend "%s" in TopHub' % backend
 89 | 
 90 |     version = PACKAGE_VERSION[backend]
 91 |     package_name = "%s_%s.log" % (backend, version)
 92 |     if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, package_name)):
 93 |         return True
 94 | 
 95 |     if sys.version_info >= (3,):
 96 |         import urllib.request as urllib2
 97 |     else:
 98 |         import urllib2
 99 |     try:
100 |         download_package(package_name)
101 |         return True
102 |     except urllib2.URLError as e:
103 |         logging.warning("Failed to download tophub package for %s: %s", backend, e)
104 |         return False
105 | 
106 | 
107 | def download_package(package_name):
108 |     """Download pre-tuned parameters of operators for a backend
109 | 
110 |     Parameters
111 |     ----------
112 |     package_name: str
113 |         The name of package
114 |     """
115 |     rootpath = AUTOTVM_TOPHUB_ROOT_PATH
116 | 
117 |     if not os.path.isdir(rootpath):
118 |         # make directory
119 |         splits = os.path.split(rootpath)
120 |         for j in range(1, len(splits)+1):
121 |             path = os.path.join(*splits[:j])
122 |             if not os.path.isdir(path):
123 |                 os.mkdir(path)
124 | 
125 |     logger.info("Download pre-tuned parameters package %s", package_name)
126 |     download("https://raw.githubusercontent.com/uwsampl/tvm-distro/master/tophub/%s"
127 |              % package_name, os.path.join(rootpath, package_name), True, verbose=0)
128 | 
129 | ```
130 | 在ApplyHistoryBest初始化时，会调用自己的load函数去加载已经下载的配置文件，加载时使用target作为key值。
131 | 配置文件一般是json格式的，当然TVM当前也支持pickle格式，我们选取其中一行来看一下具体的内容。
132 | 
133 | ```json
134 | {
135 | 	"i": ["cuda -model=titanx", "topi_nn_conv2d", [
136 | 			["TENSOR", [1, 128, 56, 56], "float32"],
137 | 			["TENSOR", [256, 128, 3, 3], "float32"],
138 | 			[1, 1],
139 | 			[1, 1], "NCHW", "float32"
140 | 		], {},
141 | 		["conv2d", [1, 128, 56, 56, "float32"],
142 | 			[256, 128, 3, 3, "float32"],
143 | 			[1, 1],
144 | 			[1, 1], "NCHW", "float32"
145 | 		], {
146 | 			"c": null,
147 | 			"e": [
148 | 				["tile_b", "sp", [36, 1, 1, 1]],
149 | 				["tile_y", "sp", [4, 2, 8, 4]],
150 | 				["tile_x", "sp", [1, 7, 28, 1]],
151 | 				["tile_rc", "sp", [16, 8]],
152 | 				["auto_unroll_max_step", "ot", 1500],
153 | 				["unroll_explicit", "ot", 1]
154 | 			],
155 | 			"i": 714385,
156 | 			"t": "winograd"
157 | 		}
158 | 	],
159 | 	"r": [
160 | 		[0.00023493554432348364], 0, 4.635826826095581, 1535411899.2398431
161 | 	],
162 | 	"v": 0.1
163 | }
164 | ```
165 | 在读取配置文件时，我们能够看到，配置文件中的"i"包含了目标任务的描述，包括target，task_name, task_args, task_kwargs, workload, config等，
166 | 其中target和task信息用来匹配待优化任务，config则是该任务下建议的优化配置。
167 | 
168 | python/tvm/autotvm/record.py
169 | ```python
170 | def decode(row, protocol='json'):
171 |     """Decode encoded record string to python object
172 | 
173 |     Parameters
174 |     ----------
175 |     row: str
176 |         a row in the logger file
177 |     protocol: str
178 |         log protocol, json or pickle
179 | 
180 |     Returns
181 |     -------
182 |     input: autotvm.tuner.MeasureInput
183 |     result: autotvm.tuner.MeasureResult
184 |     """
185 |     # pylint: disable=unused-variable
186 |     if protocol == 'json':
187 |         row = json.loads(row)
188 |         tgt, task_name, task_args, task_kwargs, workload, config = row['i']
189 |         tgt = _target.create(str(tgt))
190 | 
191 |         def clean_json_to_python(x):
192 |             """1. Convert all list in x to tuple (hashable)
193 |                2. Convert unicode to str for python2
194 |             """
195 |             if isinstance(x, list):
196 |                 return tuple([clean_json_to_python(a) for a in x])
197 |             if isinstance(x, _unicode):
198 |                 return str(x)
199 |             if isinstance(x, (_long, int)):
200 |                 return int(x)
201 |             return x
202 | 
203 |         tsk = task.Task(clean_json_to_python(task_name), clean_json_to_python(task_args))
204 |         tsk.workload = clean_json_to_python(workload)
205 |         config = ConfigEntity.from_json_dict(config)
206 |         inp = MeasureInput(tgt, tsk, config)
207 |         result = MeasureResult(*[tuple(x) if isinstance(x, list) else x for x in row["r"]])
208 | 
209 |         return inp, result
210 |     if protocol == 'pickle':
211 |         items = row.split("\t")
212 |         tgt = _target.create(items[0])
213 |         task_tuple = pickle.loads(base64.b64decode(items[1].encode()))
214 |         config = pickle.loads(base64.b64decode(items[2].encode()))
215 |         result = pickle.loads(base64.b64decode(items[3].encode()))
216 | 
217 |         tsk = task.Task(task_tuple[0], task_tuple[1])
218 |         tsk.workload = task_tuple[3]
219 |         return MeasureInput(tgt, tsk, config), MeasureResult(*result)
220 | 
221 |     raise RuntimeError("Invalid log protocol: " + protocol)
222 | 
223 | ```
224 | 
225 | 对于其中ConfigSpace的解析，使用的是ConfigEntity类，json文件中"e"对应的每一项就是一个config，
226 | 参考ConfigEntity中函数from_json_dict的实现：
227 | 
228 | python/tvm/autotvm/task/space.py
229 | ```python
230 |         index = json_dict["i"]
231 |         code_hash = json_dict["c"]
232 |         template_key = json_dict["t"]
233 |         constraints = []
234 |         entity_map = OrderedDict()
235 |         
236 |         for item in json_dict["e"]:
237 |             key, knob_type, knob_args = item
238 |             if knob_type == 'sp':
239 |                 entity = SplitEntity(knob_args)
240 |             elif knob_type == 're':
241 |                 entity = ReorderEntity(knob_args)
242 |             elif knob_type == 'an':
243 |                 entity = AnnotateEntity(knob_args)
244 |             elif knob_type == 'ot':
245 |                 entity = OtherOptionEntity(knob_args)
246 |             else:
247 |                 raise RuntimeError("Invalid config knob type: " + knob_type)
248 |             entity_map[str(key)] = entity
249 | 
250 | ```
251 | 


--------------------------------------------------------------------------------
/source-analysis/通过计算图如何生成最简单的代码.md:
--------------------------------------------------------------------------------
  1 | 
  2 | 考虑下面最简单的例子
  3 | ```python
  4 | n = tvm.const(128, "int32")
  5 | a = tvm.placeholder((n, ), name="a")
  6 | b = tvm.placeholder((n, ), name="b")
  7 | c = tvm.compute((n, ), lambda i: a[i] + b[i], name='c')
  8 | 
  9 | sch = tvm.create_schedule(c.op)
 10 | ir  = tvm.lower(sch, [a, b, c], simple_mode=True)
 11 | print(ir)
 12 | ```
 13 | 
 14 | 输出结果为：
 15 | ```c++
 16 | produce c {
 17 |   for (i, 0, 128) {
 18 |     c[i] = (a[i] + b[i])
 19 |   }
 20 | }
 21 | ```
 22 | 
 23 | 那么tvm.lower函数是如何生成这段代码的呢？
 24 | 
 25 | python/tvm/build_module.py
 26 | ```python
 27 | def lower(sch,
 28 |           args,
 29 |           name="default_function",
 30 |           binds=None,
 31 |           simple_mode=False):
 32 |   ......
 33 |   
 34 |     binds, arg_list = get_binds(args, binds)
 35 |     cfg = current_build_config()
 36 |     add_lower_pass = cfg.add_lower_pass if cfg.add_lower_pass else []
 37 |     if cfg.dump_pass_ir:
 38 |         add_lower_pass = BuildConfig._dump_ir.decorate_custompass(add_lower_pass)
 39 |     lower_phase0 = [x[1] for x in add_lower_pass if x[0] == 0]
 40 |     lower_phase1 = [x[1] for x in add_lower_pass if x[0] == 1]
 41 |     lower_phase2 = [x[1] for x in add_lower_pass if x[0] == 2]
 42 |     lower_phase3 = [x[1] for x in add_lower_pass if x[0] > 2]
 43 | 
 44 |     # Phase 0
 45 |     if isinstance(sch, schedule.Schedule):
 46 |         stmt = form_body(sch)
 47 | 
 48 |     for f in lower_phase0:
 49 |         stmt = f(stmt)
 50 |     # Phase 1
 51 | 
 52 | 
 53 |     stmt = ir_pass.InjectVirtualThread(stmt)
 54 |     stmt = ir_pass.InjectDoubleBuffer(stmt, cfg.double_buffer_split_loop)
 55 |     stmt = ir_pass.StorageRewrite(stmt)
 56 |     stmt = ir_pass.UnrollLoop(
 57 |         stmt,
 58 |         cfg.auto_unroll_max_step,
 59 |         cfg.auto_unroll_max_depth,
 60 |         cfg.auto_unroll_max_extent,
 61 |         cfg.unroll_explicit)
 62 |     for f in lower_phase2:
 63 |         stmt = f(stmt)
 64 |     # Phase 3
 65 |     stmt = ir_pass.Simplify(stmt)
 66 |     stmt = ir_pass.LowerStorageAccessInfo(stmt)
 67 |     stmt = ir_pass.RemoveNoOp(stmt)
 68 |     if not cfg.disable_select_rewriting:
 69 |         stmt = ir_pass.RewriteUnsafeSelect(stmt)
 70 |     for f in lower_phase3:
 71 |         stmt = f(stmt)
 72 |     # Instrument BoundCheckers
 73 |     if cfg.instrument_bound_checkers:
 74 |         stmt = ir_pass.InstrumentBoundCheckers(stmt)
 75 |     if simple_mode:
 76 |         return stmt
 77 |     return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func)
 78 |  ```
 79 |  
 80 |  我们先看一下tvm.lower()的返回值吧，这个例子里simple_mode是True，所以返回的是Stmt类型。具体的Stmt类型比较多，通过打印返回的类型可以看出实际
 81 |  类型是tvm.stmt.ProducerConsumer，这个类定义在tvm/stmt.py里，当我们打印其内容时，会调用到NodeBase类重载的`__repr__`函数内，也就是会跳转到
 82 |  C++侧的实现_format_str()里。
 83 |  
 84 |  ```python
 85 |  class NodeBase(_NodeBase):
 86 |     """NodeBase is the base class of all TVM language AST object."""
 87 |     def __repr__(self):
 88 |         return _api_internal._format_str(self)
 89 |  ```
 90 | 
 91 | 在C++实现里，调用的是args[0]（TVMArgValue类型）的operator NodeRef()类型转换方法，把参数转换成NodeRef后再输出。
 92 | src/api/api_base.cc
 93 | ```c++
 94 | TVM_REGISTER_API("_format_str")
 95 | .set_body([](TVMArgs args,  TVMRetValue *ret) {
 96 |     CHECK(args[0].type_code() == kNodeHandle);
 97 |     std::ostringstream os;
 98 |     os << args[0].operator NodeRef();
 99 |     *ret = os.str();
100 |   });
101 | ```
102 |  第一次看到这个的时候我也有点蒙，debug后发现进入了一个模板定义函数TVMArgValue::operator T()，之后转到detail::TVMValueCast::Apply()里。
103 |  
104 | include/tvm/runtime/packed_func.h
105 | ```c++
106 | namespace detail {
107 | template<typename T, typename TSrc, bool is_ext, bool is_nd>
108 | struct TVMValueCast {
109 |   static T Apply(const TSrc* self) {
110 |     static_assert(!is_ext && !is_nd, "The default case accepts only non-extensions");
111 |     return self->template AsNodeRef<T>();
112 |   }
113 | };
114 | ......
115 | template<typename T, typename>
116 | inline TVMArgValue::operator T() const {
117 |   return detail::
118 |       TVMValueCast<T, TVMArgValue,
119 |                    (extension_type_info<T>::code != 0),
120 |                    (array_type_info<T>::code > 0)>
121 |       ::Apply(this);
122 | }
123 | ```
124 | 
125 | 接下来调用的是TVMArgValue::AsNodeRef()，其中又调用了ptr模板函数，因此最终返回的是NodeRef对象，其内部保存了Node中的value。
126 | 
127 | include/tvm/packed_func_ext.h
128 | ```c++
129 | template<typename TNodeRef>
130 | inline TNodeRef TVMArgValue::AsNodeRef() const {
131 |   static_assert(
132 |       std::is_base_of<NodeRef, TNodeRef>::value,
133 |       "Conversion only works for NodeRef");
134 |   if (type_code_ == kNull) return TNodeRef(NodePtr<Node>(nullptr));
135 |   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
136 |   NodePtr<Node>& sptr = *ptr<NodePtr<Node> >();
137 |   CHECK(NodeTypeChecker<TNodeRef>::Check(sptr.get()))
138 |       << "Expected type " << NodeTypeName<TNodeRef>()
139 |       << " but get " << sptr->type_key();
140 |   return TNodeRef(sptr);
141 | }
142 | ```
143 | 
144 | include/tvm/runtime/packed_func.h
145 | ```c++
146 | class TVMPODValue_ {
147 |  public:
148 |   ......
149 |   template<typename T>
150 |   T* ptr() const {
151 |     return static_cast<T*>(value_.v_handle);
152 |   }
153 | ```
154 | 
155 | 实际在用ostringstream来打印NodeRef对象时，调用了重载的<<()函数，这里调用了IRPrinter::Print()函数。
156 | 
157 | include/tvm/expr.h
158 | ```c++
159 | class IRPrinter {
160 |  public:
161 |   /*! \brief The output stream */
162 |   std::ostream& stream;
163 |   /*! \brief The indentation level. */
164 |   int indent{0};
165 |   explicit IRPrinter(std::ostream& stream)  // NOLINT(*)
166 |       : stream(stream) {}
167 | 
168 |   /*! \brief The node to be printed. */
169 |   TVM_DLL void Print(const NodeRef& node);
170 |   /*! \brief Print indent to the stream */
171 |   TVM_DLL void PrintIndent();
172 |   // Allow registration to be printer.
173 |   using FType = IRFunctor<void(const NodeRef&, IRPrinter *)>;
174 |   TVM_DLL static FType& vtable();
175 | };
176 | 
177 | inline std::ostream& operator<<(std::ostream& os, const NodeRef& n) {  // NOLINT(*)
178 |   IRPrinter(os).Print(n);
179 |   return os;
180 | }
181 | }  // namespace tvm
182 | ```
183 | 
184 | 在实现Print函数时，如果vtable里为该NodeRef的类型注册了打印函数，就调用该函数，否则直接输出其type key和地址。
185 | src/lang/expr.cc
186 | ```c++
187 | void IRPrinter::Print(const NodeRef& ir) {
188 |   static const FType& f = vtable();
189 |   if (!ir.defined()) {
190 |     stream << "(nullptr)";
191 |   } else {
192 |     if (f.can_dispatch(ir)) {
193 |       f(ir, this);
194 |     } else {
195 |       // default value, output type key and addr.
196 |       stream << ir->type_key() << "(" << ir.get() << ")";
197 |     }
198 |   }
199 | }
200 | 
201 | IRPrinter::FType& IRPrinter::vtable() {
202 |   static FType inst;
203 |   return inst;
204 | }
205 | ```
206 | 
207 | src/lang/ir.cc
208 | ```c++
209 | TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
210 | .set_dispatch<ProducerConsumer>([](const ProducerConsumer* op, IRPrinter* p) {
211 |     if (op->is_producer) {
212 |       p->PrintIndent();
213 |       p->stream << "produce " << op->func->func_name() << " {\n";
214 |       p->indent += 2;
215 |       p->Print(op->body);
216 |       p->indent -= 2;
217 |       p->PrintIndent();
218 |       p->stream << "}\n";
219 |     } else {
220 |       p->Print(op->body);
221 |     }
222 |   });
223 | ```
224 | IRPrinter的vtable函数里保存了对于某个NodeRef类型的处理函数，上面代码里的set_dispatch()的作用就是指定对于ProducerConsumer类型的打印方法。
225 | 该方法被加入到Functor的函数列表里，函数列表的下标就是NodeRef对象的类型所对应的类型索引（type index，由type key转换得到）。
226 | include/tvm/node/ir_functor.h
227 | ```c++
228 | template<typename R, typename ...Args>
229 | class IRFunctor<R(const NodeRef& n, Args...)> {
230 |  private:
231 |   using Function = std::function<R (const NodeRef&n, Args...)>;
232 |   using TSelf = IRFunctor<R (const NodeRef& n, Args...)>;
233 |   /*! \brief internal function table */
234 |   std::vector<Function> func_;
235 | 
236 |  public:
237 |   /*! \brief the result type of this functor */
238 |   using result_type = R;
239 |   /*!
240 |    * \brief Whether the functor can dispatch the corresponding Node
241 |    * \param n The node to be dispatched
242 |    * \return Whether dispatching function is registered for n's type.
243 |    */
244 |   inline bool can_dispatch(const NodeRef& n) const {
245 |     uint32_t type_index = n.type_index();
246 |     return type_index < func_.size() && func_[type_index] != nullptr;
247 |   }
248 |   /*!
249 |    * \brief invoke the functor , dispatch on type of n
250 |    * \param n The Node argument
251 |    * \param args The additional arguments
252 |    * \return The result.
253 |    */
254 |   inline R operator()(const NodeRef& n, Args... args) const {
255 |     uint32_t type_index = n.type_index();
256 |     CHECK(type_index < func_.size() &&
257 |           func_[type_index] != nullptr)
258 |         << "IRFunctor calls un-registered function on type "
259 |         << Node::TypeIndex2Key(type_index);
260 |     return func_[type_index](n, std::forward<Args>(args)...);
261 |   }
262 |   /*!
263 |    * \brief set the dispacher for type TNode
264 |    * \param f The function to be set.
265 |    * \tparam TNode the type of Node to be dispatched.
266 |    * \return reference to self.
267 |    */
268 |   template<typename TNode>
269 |   inline TSelf& set_dispatch(Function f) {  // NOLINT(*)
270 |     uint32_t tindex = Node::TypeKey2Index(TNode::_type_key);
271 |     if (func_.size() <= tindex) {
272 |       func_.resize(tindex + 1, nullptr);
273 |     }
274 |     CHECK(func_[tindex] == nullptr)
275 |         << "Dispatch for " << Node::TypeIndex2Key(tindex)
276 |         << " is already set";
277 |     func_[tindex] = f;
278 |     return *this;
279 |   }
280 |   /*!
281 |    * \brief set the dispacher for type TNode
282 |    *  This allows f to used detailed const Node pointer to replace NodeRef
283 |    *
284 |    * \param f The function to be set.
285 |    * \tparam TNode the type of Node to be dispatched.
286 |    * \return reference to self.
287 |    */
288 |   template<typename TNode>
289 |   inline TSelf& set_dispatch(std::function<R(const TNode* n, Args...)> f) { // NOLINT(*)
290 |     Function fun = [f](const NodeRef& n, Args... args) {
291 |       return f(static_cast<const TNode*>(n.node_.get()),
292 |                std::forward<Args>(args)...);
293 |     };
294 |     return this->set_dispatch<TNode>(fun);
295 |   }
296 |   /*!
297 |   * \brief unset the dispacher for type TNode
298 |   *
299 |   * \tparam TNode the type of Node to be dispatched.
300 |   * \return reference to self.
301 |   */
302 |   template<typename TNode>
303 |   inline TSelf& clear_dispatch() {  // NOLINT(*)
304 |     uint32_t tindex = Node::TypeKey2Index(TNode::_type_key);
305 |     CHECK_LT(tindex, func_.size()) << "clear_dispatch: index out of range";
306 |     func_[tindex] = nullptr;
307 |     return *this;
308 |   }
309 | };
310 | 
311 | #define TVM_REGISTER_VAR_DEF(ClsName)                                 \
312 |   static TVM_ATTRIBUTE_UNUSED auto & __make_functor ## _ ## ClsName
313 |   
314 | #define TVM_STATIC_IR_FUNCTOR(ClsName, FField)                       \
315 |   TVM_STR_CONCAT(TVM_REGISTER_VAR_DEF(ClsName), __COUNTER__)  =      \
316 |                               ClsName::FField()
317 | ```                              
318 | 由此可见，打印ProducerConsumer的实现是使用IRPrinter，然后指定了ProducerConsumer对应的打印函数来实现的，在注释里也提到了vtable的方式类似于实现了
319 | 继承关系，只是这种方式不需要像继承那样重新实现Print函数。
320 | 
321 | 打印时核心代码来自于这一句：
322 | p->Print(op->body);
323 | 
324 | op是ProducerConsumer类型，继承于StmtNode，ProducerConsumer有个成员body，通过debug我们可以知道这个body是For循环。
325 | 我们看一下这个ForNode对象是如何创建的。
326 | 
327 | 前面分析的时候，python/tvm/build_module.py里有个lower函数，里边调用了函数form_body(sch),form_body(sch)又调用了schedule.ScheduleOps函数:
328 |     stmt = schedule.ScheduleOps(sch, bounds)
329 |     stmt = ir_pass.InjectPrefetch(stmt)
330 | schedule.ScheduleOps是C++侧的实现，
331 | 
332 | src/api/api_schedule.cc
333 | ```c++
334 | TVM_REGISTER_API("schedule.ScheduleOps")
335 | .set_body([](TVMArgs args, TVMRetValue* ret) {
336 |   if (args.size() == 2)
337 |     *ret = ScheduleOps(args[0], args[1], false);
338 |   else
339 |     *ret = ScheduleOps(args[0], args[1], args[2]);
340 | });
341 | ```
342 | ScheduleOps()调用了
343 | Stmt ScheduleOps(Schedule sch, Map<IterVar, Range> dom_map_, bool debug_keep_trivial_loop) {
344 |   ......
345 |   body = MakePipeline(s, dom_map, body, debug_keep_trivial_loop);
346 |     ......
347 |     Stmt producer = s->op->BuildProvide(s, dom_map, debug_keep_trivial_loop);
348 |       ......
349 |       MakeComputeStmt(this, stage, dom_map, debug_keep_trivial_loop);
350 |         ComputeLoopNest n = ComputeLoopNest::make(self, stage, dom_map, debug_keep_trivial_loop);
351 |           ret.main_nest = op::MakeLoopNest(stage, dom_map, 0, false, std::unordered_set<IterVar>(), &ret.main_vmap,
352 |              ......
353 |              nest[i + 1].emplace_back(For::make(var, 0, dom->extent,for_type, DeviceAPI::None, no_op));
354 | 
355 | 这里是For::make的定义，构造时指定了循环变量、最小值、范围（extent）以及循环类型（串行、并行、向量化或者循环展开）等：
356 | ```c++
357 | Stmt For::make(Var loop_var,
358 |                Expr min,
359 |                Expr extent,
360 |                ForType for_type,
361 |                DeviceAPI device_api,
362 |                Stmt body) {
363 |   CHECK(min.defined());
364 |   CHECK(extent.defined());
365 |   CHECK(min.type().is_scalar());
366 |   CHECK(extent.type().is_scalar());
367 |   CHECK(loop_var.type().is_scalar());
368 |   CHECK(body.defined());
369 | 
370 |   NodePtr<For> node = make_node<For>();
371 |   node->loop_var = std::move(loop_var);
372 |   node->min = std::move(min);
373 |   node->extent = std::move(extent);
374 |   node->for_type = for_type;
375 |   node->device_api = device_api;
376 |   node->body = std::move(body);
377 |   return Stmt(node);
378 | }
379 | ```
380 | 


--------------------------------------------------------------------------------
/source-analysis/Python和C++之间函数调用的实现.md:
--------------------------------------------------------------------------------
  1 | 
  2 | TVM很多核心代码是用C++实现的，但是为了方便工程师使用，提供了python的前端，这样就经常涉及到在python里调用C++的代码，为了扩展方便，在实现时定义了几个宏用来注册函数和查找函数，在include/tvm/runtime/registry.h里，定义了宏TVM_REGISTER_GLOBAL，在include/tvm/api_registry.h里定义了宏TVM_REGISTER_API(OpName)，事实上也是用了宏TVM_REGISTER_GLOBAL.
  3 | 
  4 | #define TVM_REGISTER_API(OpName) TVM_REGISTER_GLOBAL(OpName)
  5 | 
  6 | ```c++
  7 | /*!
  8 |  * \brief Register a function globally.
  9 |  * \code
 10 |  *   TVM_REGISTER_GLOBAL("MyPrint")
 11 |  *   .set_body([](TVMArgs args, TVMRetValue* rv) {
 12 |  *   });
 13 |  * \endcode
 14 |  */
 15 | #define TVM_REGISTER_GLOBAL(OpName)                              \
 16 |   TVM_STR_CONCAT(TVM_FUNC_REG_VAR_DEF, __COUNTER__) =            \
 17 |       ::tvm::runtime::Registry::Register(OpName)
 18 | ```
 19 | 
 20 | 通过这个宏定义的函数能够被python等前端语言调用。具体的说明可以看这个类Registry的说明，建议的使用方法是使用时顺便调用set_body方法，
 21 | 并传递一个lambda函数作为参数。比较容易理解的是在前端语言里调用时会直接调用到这个lambda函数。
 22 | 
 23 | 在注册的时候，TVM使用了一个全局的Manager对象，该对象中维护了一个function map，该函数的名字和指针就保存在这个map里，以后的查找也是通过这个方式进行
 24 | 查找。
 25 | ```c++
 26 | Registry& Registry::Register(const std::string& name, bool override) {  // NOLINT(*)
 27 |   Manager* m = Manager::Global();
 28 |   std::lock_guard<std::mutex> lock(m->mutex);
 29 |   auto it = m->fmap.find(name);
 30 |   if (it == m->fmap.end()) {
 31 |     Registry* r = new Registry();
 32 |     r->name_ = name;
 33 |     m->fmap[name] = r;
 34 |     return *r;
 35 |   } else {
 36 |     CHECK(override)
 37 |       << "Global PackedFunc " << name << " is already registered";
 38 |     return *it->second;
 39 |   }
 40 | }
 41 | ```
 42 | 
 43 | ```c++
 44 | class Registry {
 45 |  public:
 46 |   TVM_DLL Registry& set_body(PackedFunc f);  // NOLINT(*)
 47 |   Registry& set_body(PackedFunc::FType f) {  // NOLINT(*)
 48 |     return set_body(PackedFunc(f));
 49 |   }
 50 | ```
 51 | 
 52 | set_body函数的实现在src/runtime/registry.cc。
 53 | 
 54 | TVM是通过ctypes和ffi机制来实现在python语言里调用c++语言里的函数的。在查找时是通过在python里调用c++实现的TVMFuncGetGlobal函数来得到的。
 55 | 
 56 | python/tvm/_ffi/function.py
 57 | ```python
 58 | def get_global_func(name, allow_missing=False):
 59 |     handle = FunctionHandle()
 60 |     check_call(_LIB.TVMFuncGetGlobal(c_str(name), ctypes.byref(handle)))
 61 |     if handle.value:
 62 |         return Function(handle, False)
 63 | 
 64 |     if allow_missing:
 65 |         return None
 66 | 
 67 |     raise ValueError("Cannot find global function %s" % name)
 68 | ```
 69 | 
 70 | 可以看到，C++的函数在Python里被封装成一个Function类，而这个类是基于ctypes的FunctionBase定义的。
 71 | 
 72 | ```python
 73 | from ._ctypes.function import FunctionBase as _FunctionBase
 74 | 
 75 | class Function(_FunctionBase):
 76 |     """The PackedFunc object used in TVM.
 77 | 
 78 |     Function plays an key role to bridge front and backend in TVM.
 79 |     Function provide a type-erased interface, you can call function with positional arguments.
 80 | 
 81 |     The compiled module returns Function.
 82 |     TVM backend also registers and exposes its API as Functions.
 83 |     For example, the developer function exposed in tvm.ir_pass are actually
 84 |     C++ functions that are registered as PackedFunc
 85 | 
 86 |     The following are list of common usage scenario of tvm.Function.
 87 | 
 88 |     - Automatic exposure of C++ API into python
 89 |     - To call PackedFunc from python side
 90 |     - To call python callbacks to inspect results in generated code
 91 |     - Bring python hook into C++ backend
 92 | 
 93 |     See Also
 94 |     --------
 95 |     tvm.register_func: How to register global function.
 96 |     tvm.get_global_func: How to get global function.
 97 |     """
 98 | ```
 99 | 
100 | python/tvm/_ffi/_ctypes/node.py
101 | ```python
102 | def _return_node(x):
103 |     """Return node function"""
104 |     handle = x.v_handle
105 |     if not isinstance(handle, NodeHandle):
106 |         handle = NodeHandle(handle)
107 |     tindex = ctypes.c_int()
108 |     check_call(_LIB.TVMNodeGetTypeIndex(handle, ctypes.byref(tindex)))
109 |     cls = NODE_TYPE.get(tindex.value, NodeBase)
110 |     # Avoid calling __init__ of cls, instead directly call __new__
111 |     # This allows child class to implement their own __init__
112 |     node = cls.__new__(cls)
113 |     node.handle = handle
114 |     return node
115 | ```
116 | 
117 | 那么，在python中是如何识别这些函数并调用的呢？在python代码中我们可以找到初始化的代码：
118 | 
119 | python/tvm/module.py
120 | ```python
121 | _init_api("tvm.module")
122 | _set_class_module(Module)
123 | ```
124 | 
125 | 具体实现时，先通过调用C++函数TVMFuncListGlobalNames获取所有注册的函数列表，再根据函数名一个一个的生成PackedFunction，并绑定到指定模块的函数名上，这样在python端调用该函数时，会直接调用该PackedFunction的`__call__`函数，具体见下面的代码。
126 | 
127 | python/tvm/_ffi/function.py
128 | ```python
129 | def list_global_func_names():
130 |     """Get list of global functions registered.
131 | 
132 |     Returns
133 |     -------
134 |     names : list
135 |        List of global functions names.
136 |     """
137 |     plist = ctypes.POINTER(ctypes.c_char_p)()
138 |     size = ctypes.c_uint()
139 | 
140 |     check_call(_LIB.TVMFuncListGlobalNames(ctypes.byref(size),
141 |                                            ctypes.byref(plist)))
142 |     fnames = []
143 |     for i in range(size.value):
144 |         fnames.append(py_str(plist[i]))
145 |     return fnames
146 | 
147 | def get_global_func(name, allow_missing=False):
148 |     """Get a global function by name
149 | 
150 |     Parameters
151 |     ----------
152 |     name : str
153 |         The name of the global function
154 | 
155 |     allow_missing : bool
156 |         Whether allow missing function or raise an error.
157 | 
158 |     Returns
159 |     -------
160 |     func : tvm.Function
161 |         The function to be returned, None if function is missing.
162 |     """
163 |     handle = FunctionHandle()
164 |     check_call(_LIB.TVMFuncGetGlobal(c_str(name), ctypes.byref(handle)))
165 |     if handle.value:
166 |         return Function(handle, False)
167 | 
168 |     if allow_missing:
169 |         return None
170 | 
171 |     raise ValueError("Cannot find global function %s" % name)
172 |     
173 | def _init_api_prefix(module_name, prefix):
174 |     module = sys.modules[module_name]
175 | 
176 |     for name in list_global_func_names():
177 |         if prefix == "api":
178 |             fname = name
179 |             if name.startswith("_"):
180 |                 target_module = sys.modules["tvm._api_internal"]
181 |             else:
182 |                 target_module = module
183 |         else:
184 |             if not name.startswith(prefix):
185 |                 continue
186 |             fname = name[len(prefix)+1:]
187 |             target_module = module
188 | 
189 |         if fname.find(".") != -1:
190 |             continue
191 |         f = get_global_func(name)
192 |         ff = _get_api(f)
193 |         ff.__name__ = fname
194 |         ff.__doc__ = ("TVM PackedFunc %s. " % fname)
195 |         setattr(target_module, ff.__name__, ff)
196 | ```
197 | 那么还有一个问题，在Python和C++里，各自有对象的表示，在函数调用时，参数和返回值的表示是需要转换的，因为ctypes还做不到完全自动转换。从前面可以看到Function是继承于FunctionBase的，在函数调用时，缺省调用的是FunctionBase的`__call__`函数，在函数调用的开头，`_make_tvm_args`做了函数参数的封装转换，在函数调用的末尾，`RETURN_SWITCH[ret_tcode.value](ret_val)`对返回值做了转换。
198 | 
199 | ```python
200 | class FunctionBase(object):
201 |     """Function base."""
202 |     __slots__ = ["handle", "is_global"]
203 |     # pylint: disable=no-member
204 |     def __init__(self, handle, is_global):
205 |         """Initialize the function with handle
206 | 
207 |         Parameters
208 |         ----------
209 |         handle : FunctionHandle
210 |             the handle to the underlying function.
211 | 
212 |         is_global : bool
213 |             Whether this is a global function in python
214 |         """
215 |         self.handle = handle
216 |         self.is_global = is_global
217 | 
218 |     def __del__(self):
219 |         if not self.is_global and _LIB is not None:
220 |             if _LIB.TVMFuncFree(self.handle) != 0:
221 |                 raise get_last_ffi_error()
222 | 
223 |     def __call__(self, *args):
224 |         """Call the function with positional arguments
225 | 
226 |         args : list
227 |            The positional arguments to the function call.
228 |         """
229 |         temp_args = []
230 |         values, tcodes, num_args = _make_tvm_args(args, temp_args)
231 |         ret_val = TVMValue()
232 |         ret_tcode = ctypes.c_int()
233 |         if _LIB.TVMFuncCall(
234 |                 self.handle, values, tcodes, ctypes.c_int(num_args),
235 |                 ctypes.byref(ret_val), ctypes.byref(ret_tcode)) != 0:
236 |             raise get_last_ffi_error()
237 |         _ = temp_args
238 |         _ = args
239 |         return RETURN_SWITCH[ret_tcode.value](ret_val)
240 | ```
241 | 
242 | 在这里定义了缺省的几种返回值处理方法：
243 | python/tvm/_ffi/_ctypes/types.py
244 | ```python
245 | RETURN_SWITCH = {
246 |     TypeCode.INT: lambda x: x.v_int64,
247 |     TypeCode.FLOAT: lambda x: x.v_float64,
248 |     TypeCode.HANDLE: _return_handle,
249 |     TypeCode.NULL: lambda x: None,
250 |     TypeCode.STR: lambda x: py_str(x.v_str),
251 |     TypeCode.BYTES: _return_bytes,
252 |     TypeCode.TVM_CONTEXT: _return_context
253 | }
254 | ```
255 | 
256 | 这里定义了TypeCode：
257 | python/tvm/_ffi/runtime_ctypes.py
258 | ```python
259 | class TypeCode(object):
260 |     """Type code used in API calls"""
261 |     INT = 0
262 |     UINT = 1
263 |     FLOAT = 2
264 |     HANDLE = 3
265 |     NULL = 4
266 |     TVM_TYPE = 5
267 |     TVM_CONTEXT = 6
268 |     ARRAY_HANDLE = 7
269 |     NODE_HANDLE = 8
270 |     MODULE_HANDLE = 9
271 |     FUNC_HANDLE = 10
272 |     STR = 11
273 |     BYTES = 12
274 |     NDARRAY_CONTAINER = 13
275 |     OBJECT_CELL = 14
276 |     EXT_BEGIN = 15
277 | ```
278 | 
279 | 有一些TypeCode的处理函数是在其他模块指定的，比如对于Node类型的返回值处理是在这里定义的。至于根据NodeType再选择合适的类型进行实例化，那是更进一步的处理了，需要Python代码和C++代码之间事先约定好。
280 | 
281 | python/tvm/_ffi/_ctypes/node.py
282 | ```python
283 | def _return_node(x):
284 |     """Return node function"""
285 |     handle = x.v_handle
286 |     if not isinstance(handle, NodeHandle):
287 |         handle = NodeHandle(handle)
288 |     tindex = ctypes.c_int()
289 |     check_call(_LIB.TVMNodeGetTypeIndex(handle, ctypes.byref(tindex)))
290 |     cls = NODE_TYPE.get(tindex.value, NodeBase)
291 |     # Avoid calling __init__ of cls, instead directly call __new__
292 |     # This allows child class to implement their own __init__
293 |     node = cls.__new__(cls)
294 |     node.handle = handle
295 |     return node
296 |     
297 | RETURN_SWITCH[TypeCode.NODE_HANDLE] = _return_node
298 | ```
299 | 
300 | 在C++侧，函数也是先被注册到Registry里的，我们需要看看最终是如何调用到注册的函数以及如何处理参数和返回值的。比如调用tvm.placeholder函数时，最终应该调用到placeholder函数：
301 | src/api/api_lang.cc
302 | ```c++
303 | TVM_REGISTER_API("_Placeholder")
304 | .set_body_typed<Tensor(Array<Expr>, Type, std::string)>([](
305 |   Array<Expr> shape, Type dtype, std::string name
306 | ) {
307 |   return placeholder(shape, dtype, name);
308 | });
309 | ```
310 | 
311 | src/op/placeholder_op.cc
312 | ```c++
313 | Tensor placeholder(Array<Expr> shape, Type dtype, std::string name) {
314 |   return PlaceholderOpNode::make(name, shape, dtype).output(0);
315 | }
316 | ```
317 | 
318 | 在C++侧，函数也是先被封装成PackedFunc，然后注册到Registry里。对于像placeholder这样的op，注册时使用了TypedPackedFunc，在调用到实际代码前，先经过detail::unpack_call的几个函数对参数进行转换，最终调用到实际的代码。
319 | 
320 | 并且PackedFunc重载了运算符()，这样就可以被直接调用。
321 | include/tvm/runtime/registry.h
322 | ```c++
323 | class Registry {
324 |  public:
325 |   template<typename FType, typename FLambda>
326 |   Registry& set_body_typed(FLambda f) {
327 |     return set_body(TypedPackedFunc<FType>(f).packed());
328 |   }
329 | };  
330 | ```
331 | 
332 | include/tvm/runtime/packed_func.h
333 | ```c++
334 | template<typename R, typename ...Args>
335 | class TypedPackedFunc<R(Args...)> {
336 |  public:
337 |   template<typename FLambda,
338 |            typename = typename std::enable_if<
339 |              std::is_convertible<FLambda,
340 |                                  std::function<R(Args...)>
341 |                                  >::value>::type>
342 |   TypedPackedFunc(const FLambda& typed_lambda) {  // NOLINT(*)
343 |     this->AssignTypedLambda(typed_lambda);
344 |   }
345 |   
346 |  private:
347 |   friend class TVMRetValue;
348 |   /*! \brief The internal packed function */
349 |   PackedFunc packed_;
350 |   template<typename FLambda>
351 |   inline void AssignTypedLambda(FLambda flambda);
352 | };
353 | 
354 | template<typename R, typename ...Args>
355 | template<typename FType>
356 | inline void TypedPackedFunc<R(Args...)>::AssignTypedLambda(FType flambda) {
357 |   packed_ = PackedFunc([flambda](const TVMArgs& args, TVMRetValue* rv) {
358 |       detail::unpack_call<R, sizeof...(Args)>(flambda, args, rv);
359 |     });
360 | }
361 | 
362 | template<typename... Args>
363 | inline TVMRetValue PackedFunc::operator()(Args&& ...args) const {
364 |   const int kNumArgs = sizeof...(Args);
365 |   const int kArraySize = kNumArgs > 0 ? kNumArgs : 1;
366 |   TVMValue values[kArraySize];
367 |   int type_codes[kArraySize];
368 |   detail::for_each(TVMArgsSetter(values, type_codes),
369 |                    std::forward<Args>(args)...);
370 |   TVMRetValue rv;
371 |   body_(TVMArgs(values, type_codes, kNumArgs), &rv);
372 |   return rv;
373 | }
374 | 
375 | ```
376 | 在C++侧处理参数时，unpack_call_dispatcher通过模板递归把TVMArgs中的参数逐个取出（`args_pack[index]`）并追加到已解包的参数列表中；全部取出后（nleft为0），根据返回类型R是否为void，选用不同的unpack_call_dispatcher特化来调用实际的函数，并在有返回值时把结果写入`*rv`。
377 | ```c++
378 | namespace detail {
379 | template<typename R, int nleft, int index, typename F>
380 | struct unpack_call_dispatcher {
381 |   template<typename ...Args>
382 |   static void run(const F& f,
383 |                   const TVMArgs& args_pack,
384 |                   TVMRetValue* rv,
385 |                   Args&&... unpacked_args) {
386 |     unpack_call_dispatcher<R, nleft - 1, index + 1, F>
387 |         ::run(f, args_pack, rv,
388 |               std::forward<Args>(unpacked_args)...,
389 |               args_pack[index]);
390 |   }
391 | };
392 | 
393 | template<typename R, int index, typename F>
394 | struct unpack_call_dispatcher<R, 0, index, F> {
395 |   template<typename ...Args>
396 |   static void run(const F& f,
397 |                   const TVMArgs& args_pack,
398 |                   TVMRetValue* rv,
399 |                   Args&&... unpacked_args) {
400 |     *rv = R(f(std::forward<Args>(unpacked_args)...));
401 |   }
402 | };
403 | 
404 | template<int index, typename F>
405 | struct unpack_call_dispatcher<void, 0, index, F> {
406 |   template<typename ...Args>
407 |   static void run(const F& f,
408 |                   const TVMArgs& args_pack,
409 |                   TVMRetValue* rv,
410 |                   Args&&... unpacked_args) {
411 |     f(std::forward<Args>(unpacked_args)...);
412 |   }
413 | };
414 | 
415 | template<typename R, int nargs, typename F>
416 | inline void unpack_call(const F& f, const TVMArgs& args, TVMRetValue* rv) {
417 |   unpack_call_dispatcher<R, nargs, 0, F>::run(f, args, rv);
418 | }
419 | 
420 | template<typename R>
421 | struct typed_packed_call_dispatcher {
422 |   template<typename ...Args>
423 |   static inline R run(const PackedFunc& pf, Args&& ...args) {
424 |     return pf(std::forward<Args>(args)...);
425 |   }
426 | };
427 | 
428 | }
429 | ```
430 | 


--------------------------------------------------------------------------------
/mytophub/20191227/resnet50v1_bs_128.log:
--------------------------------------------------------------------------------
 1 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 1024, 14, 14], "float32"], ["TENSOR", [2048, 1024, 1, 1], "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 1024, 14, 14, "float32"], [2048, 1024, 1, 1, "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {"i": 271364, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 32, 4]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.0063333786, 0.006333583966666667, 0.0063340782, 0.0063342022, 0.006334789666666666, 0.006334933066666667, 0.006335168433333333, 0.006335286533333333], 0, 10.870361804962158, 1577184029.5545614], "v": 0.1}
 2 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 512, 28, 28], "float32"], ["TENSOR", [1024, 512, 1, 1], "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 512, 28, 28, "float32"], [1024, 512, 1, 1, "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {"i": 3917438, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 8, 2]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.006335183066666666, 0.006335312166666667, 0.0063353601, 0.006335797733333333, 0.0063359148, 0.0063360051, 0.006336081733333334, 0.006336378266666666], 0, 6.4039671421051025, 1577187556.1523485], "v": 0.1}
 3 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 256, 56, 56], "float32"], ["TENSOR", [512, 256, 1, 1], "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 256, 56, 56, "float32"], [512, 256, 1, 1, "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {"i": 16684918, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 4, 4]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 2, 14, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.006168428366666667, 0.006168544366666666, 0.0061705516, 0.0061715065333333334, 0.006171509966666667, 0.006173020333333334, 0.0061733456, 0.0061744653], 0, 12.13925814628601, 1577189666.2390392], "v": 0.1}
 4 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 7, 7], "float32"], [2, 2], [3, 3], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 3, 224, 224, "float32"], [64, 3, 7, 7, "float32"], [2, 2], [3, 3], [1, 1], "NCHW", "float32"], {"i": 61363740, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 4, 4]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 7, 16, 1]], ["tile_rc", "sp", [-1, 3]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 7]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.00444663625, 0.0044471325, 0.004447371958333333, 0.004447579895833334, 0.004448198833333333, 0.004448200979166666, 0.004448517604166666, 0.004449726416666667], 0, 16.35187339782715, 1577192937.8761232], "v": 0.1}
 5 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 64, 56, 56, "float32"], [64, 64, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 20623909, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 4, 1, 1]], ["tile_x", "sp", [-1, 1, 56, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0008555583011363635, 0.0008557415511363637, 0.0008557471250000001, 0.0008557529772727272, 0.0008557752670454545, 0.0008558784715909091, 0.0008558810568181818, 0.0008559708579545454], 0, 11.823119163513184, 1577195723.720199], "v": 0.1}
 6 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 256, 56, 56], "float32"], ["TENSOR", [64, 256, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 256, 56, 56, "float32"], [64, 256, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1767468, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 7, 8, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}], "r": [[0.0022989868, 0.002299052123809524, 0.0022990729714285714, 0.0022993259428571427, 0.0022999061904761904, 0.0023003163238095236, 0.002300357419047619, 0.002300424161904762], 0, 11.046765804290771, 1577197279.0836613], "v": 0.1}
 7 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 64, 56, 56, "float32"], [64, 64, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 3338391, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 4]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.0056325614, 0.005641175866666666, 0.005642314866666666, 0.0056435340666666665, 0.005643602533333333, 0.005644356866666667, 0.005644704733333334, 0.0056456084], 0, 5.522424697875977, 1577200478.7034633], "v": 0.1}
 8 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 64, 56, 56], "float32"], ["TENSOR", [256, 64, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 64, 56, 56, "float32"], [256, 64, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 25652622, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 16, 2]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 7, 8, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}], "r": [[0.00234870725, 0.00234874696875, 0.00234890246875, 0.002348962984375, 0.00234902284375, 0.002349046234375, 0.00234921534375, 0.002349832796875], 0, 105.75595426559448, 1577202198.2353306], "v": 0.1}
 9 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 256, 56, 56], "float32"], ["TENSOR", [128, 256, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 256, 56, 56, "float32"], [128, 256, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 37853820, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 16, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 7, 8, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00392392, 0.003924131020833333, 0.003924147395833333, 0.00392435775, 0.003924503833333333, 0.003925032770833333, 0.003925191604166667, 0.003925416625], 0, 7.427847862243652, 1577203574.600377], "v": 0.1}
10 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 128, 56, 56], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 128, 56, 56, "float32"], [128, 128, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {"i": 29237619, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 16, 8]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 7, 4, 1]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.0046910962499999995, 0.0046914873125, 0.0046915503125, 0.004691641, 0.004691673541666667, 0.004691932645833333, 0.0046925105833333335, 0.0046926720208333335], 0, 6.548671245574951, 1577204844.5141144], "v": 0.1}
11 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 512, 28, 28], "float32"], ["TENSOR", [128, 512, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 512, 28, 28, "float32"], [128, 512, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 2550080, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 4]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 7, 4, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}], "r": [[0.0017684523445378152, 0.0017684902184873948, 0.0017688306050420168, 0.001768910512605042, 0.0017689984033613445, 0.0017690652773109242, 0.0017692476722689077, 0.0017698901848739496], 0, 3.747542381286621, 1577206867.8462474], "v": 0.1}
12 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 128, 28, 28], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 128, 28, 28, "float32"], [128, 128, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 5033138, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 8, 8]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.004257382, 0.004257907041666667, 0.00425800125, 0.004258857666666667, 0.004258938375, 0.0042590041458333335, 0.004259054520833334, 0.004259465083333333], 0, 9.401336908340454, 1577208403.7020595], "v": 0.1}
13 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 128, 28, 28], "float32"], ["TENSOR", [512, 128, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 128, 28, 28, "float32"], [512, 128, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 6787122, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 4]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 7, 4, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.0018619601327433627, 0.0018620013805309735, 0.001862185814159292, 0.0018622138318584072, 0.0018622363628318583, 0.0018623675044247786, 0.0018624400088495577, 0.0018782160619469027], 0, 3.6450414657592773, 1577210775.4263706], "v": 0.1}
14 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 512, 28, 28], "float32"], ["TENSOR", [256, 512, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 512, 28, 28, "float32"], [256, 512, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 6146350, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 4]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 7, 4, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.003360054520833333, 0.0033604686666666665, 0.0033610647708333333, 0.003361179854166667, 0.0033613111875, 0.00336152825, 0.0033617021666666664, 0.003361851291666667], 0, 10.219793796539307, 1577212144.9801183], "v": 0.1}
15 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 256, 28, 28], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 256, 28, 28, "float32"], [256, 256, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {"i": 7229672, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 7, 1, 1]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.005012377766666667, 0.005013697366666667, 0.005014059766666666, 0.0050140877, 0.005014166433333333, 0.005014345866666667, 0.005014913533333333, 0.005015982933333333], 0, 4.985369920730591, 1577213996.6693065], "v": 0.1}
16 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 1024, 14, 14], "float32"], ["TENSOR", [256, 1024, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 1024, 14, 14, "float32"], [256, 1024, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 2006798, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 8, 2]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.0018891316756756756, 0.0018892606216216216, 0.001890942864864865, 0.0018910406036036036, 0.001891078945945946, 0.0018923574684684683, 0.0018925857117117117, 0.0019125934864864865], 0, 3.913076162338257, 1577215706.467319], "v": 0.1}
17 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 256, 14, 14], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 256, 14, 14, "float32"], [256, 256, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 9918294, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 16]], ["tile_x", "sp", [-1, 4, 16, 1]], ["tile_rc", "sp", [-1, 32]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0028194786296296295, 0.0028195532222222226, 0.002819559092592593, 0.002819758222222222, 0.0028198510185185183, 0.0028208181111111113, 0.0028208662777777778, 0.002820978537037037], 0, 12.24785852432251, 1577217545.5047538], "v": 0.1}
18 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 256, 14, 14], "float32"], ["TENSOR", [1024, 256, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 256, 14, 14, "float32"], [1024, 256, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 2893549, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 4, 2]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.0018912268272727273, 0.0018912852272727273, 0.0018913062636363638, 0.0018913225636363637, 0.0018913725, 0.0018914268818181817, 0.0018914759818181818, 0.0018919370454545455], 0, 4.0207695960998535, 1577219490.0937035], "v": 0.1}
19 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 1024, 14, 14], "float32"], ["TENSOR", [512, 1024, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 1024, 14, 14, "float32"], [512, 1024, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 3352170, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0037138228125, 0.0037138601249999998, 0.0037142617291666667, 0.0037151835416666668, 0.0037152678333333337, 0.0037155553541666666, 0.003715655520833333, 0.0037160250625], 0, 6.917834043502808, 1577220783.8878543], "v": 0.1}
20 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 512, 14, 14], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 512, 14, 14, "float32"], [512, 512, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {"i": 812716, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 1, 7]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.005348595133333333, 0.005348718333333334, 0.0053490687333333335, 0.005349613366666667, 0.005349644966666667, 0.0053498461, 0.005350681533333334, 0.005357540333333333], 0, 14.815961837768555, 1577222382.7355168], "v": 0.1}
21 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 2048, 7, 7], "float32"], ["TENSOR", [512, 2048, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 2048, 7, 7, "float32"], [512, 2048, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 223770, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.002460925557377049, 0.002461113969387755, 0.0024633469387755104, 0.0024633795901639346, 0.0024636931475409836, 0.002464621918032787, 0.0024662767540983605, 0.0024701937213114753], 0, 14.985524892807007, 1577223791.8787625], "v": 0.1}
22 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 512, 7, 7], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 512, 7, 7, "float32"], [512, 512, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 4252318, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 8]], ["tile_x", "sp", [-1, 8, 8, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.002740793218181818, 0.002741003763636364, 0.0027419965454545453, 0.0027422580909090906, 0.0027423864363636365, 0.002742666090909091, 0.0027428159272727274, 0.0027432565272727276], 0, 8.088928461074829, 1577225121.2877896], "v": 0.1}
23 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 512, 7, 7], "float32"], ["TENSOR", [2048, 512, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 512, 7, 7, "float32"], [2048, 512, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 253818, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 8, 2]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.0025597322711864404, 0.002561245254237288, 0.0025629808474576274, 0.0025632487966101695, 0.002563518033898305, 0.0025641496271186443, 0.002564390779661017, 0.00256525206779661], 0, 13.628146648406982, 1577227038.6051059], "v": 0.1}
24 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [128, 2048], "float32"], ["TENSOR", [1000, 2048], "float32"], null, "float32"], {}, ["dense", [128, 2048, "float32"], [1000, 2048, "float32"], 0, "float32"], {"i": 70959, "t": "direct", "c": null, "e": [["tile_x", "sp", [-1, 8, 1, 2]], ["tile_y", "sp", [-1, 2, 1, 5]], ["tile_k", "sp", [-1, 2, 1]]]}], "r": [[0.00354612025, 0.0035470710833333335, 0.0035479164375, 0.003548581104166667, 0.0035512449791666664, 0.003552203958333333, 0.0035528297291666667, 0.0035902876458333336], 0, 3.521620750427246, 1577228784.847085], "v": 0.1}
25 | 


--------------------------------------------------------------------------------
/mytophub/20191227/resnet50v1_bs_1.log:
--------------------------------------------------------------------------------
 1 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [2048, 1024, 1, 1], "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 1024, 14, 14, "float32"], [2048, 1024, 1, 1, "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {"i": 346943, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 32, 1]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00016145490406395735, 0.00016152403797468353, 0.0001615662405063291, 0.00016160819653564292, 0.0001616776713362069, 0.0001616778943965517, 0.00016173229418103447, 0.00016173750969827587], 0, 6.183557510375977, 1576737392.091698], "v": 0.1}
 2 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [1024, 512, 1, 1], "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 512, 28, 28, "float32"], [1024, 512, 1, 1, "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {"i": 3992886, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[6.922477752293578e-05, 6.92313821674312e-05, 6.924994581422018e-05, 6.92602072821101e-05, 6.926205733944954e-05, 6.927225745412845e-05, 6.929297190366973e-05, 6.934898107798165e-05], 0, 28.005922555923462, 1576738192.7619834], "v": 0.1}
 3 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [512, 256, 1, 1], "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 256, 56, 56, "float32"], [512, 256, 1, 1, "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {"i": 17036831, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 16, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 2, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[5.8089114824335904e-05, 5.809596629534419e-05, 5.810448986003999e-05, 5.8107251071122535e-05, 5.813846615252785e-05, 5.814329791488146e-05, 5.815662524992859e-05, 5.8211077692087974e-05], 0, 4.205289125442505, 1576739520.1315114], "v": 0.1}
 4 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 7, 7], "float32"], [2, 2], [3, 3], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 3, 224, 224, "float32"], [64, 3, 7, 7, "float32"], [2, 2], [3, 3], [1, 1], "NCHW", "float32"], {"i": 61363753, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 8]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 7, 16, 1]], ["tile_rc", "sp", [-1, 3]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 7]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[4.48079066723879e-05, 4.4849254666380605e-05, 4.485475284273761e-05, 4.485584037760137e-05, 4.485739154687835e-05, 4.488499098905815e-05, 4.4886555031109204e-05, 4.493317807337481e-05], 0, 4.223693609237671, 1576741359.9741962], "v": 0.1}
 5 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 64, 56, 56, "float32"], [64, 64, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 20616981, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 2, 28, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[8.82699406037367e-06, 8.827041878124883e-06, 8.83085647156122e-06, 8.833518627119281e-06, 8.836016540731551e-06, 8.836196759520318e-06, 8.837943498364724e-06, 8.845814706214053e-06], 0, 18.225536823272705, 1576743271.7053988], "v": 0.1}
 6 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [64, 256, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 256, 56, 56, "float32"], [64, 256, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 12011176, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[3.229955276793019e-05, 3.232549536405781e-05, 3.232731728933733e-05, 3.233292337060267e-05, 3.235091396236706e-05, 3.2362637578401966e-05, 3.236615939460049e-05, 3.2391604854104176e-05], 0, 11.575981855392456, 1576745256.3094344], "v": 0.1}
 7 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 64, 56, 56, "float32"], [64, 64, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 203771, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 8]], ["tile_x", "sp", [-1, 2, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}], "r": [[3.9518064510200326e-05, 3.956372891012681e-05, 3.957693659253813e-05, 3.9595429332843226e-05, 3.961328138209888e-05, 4.00275563315567e-05, 4.002888586656865e-05, 4.0183429149053484e-05], 0, 5.1878416538238525, 1576747236.0840814], "v": 0.1}
 8 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [256, 64, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 64, 56, 56, "float32"], [256, 64, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 18326612, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 4, 2]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 2, 28, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[2.8633713786454566e-05, 2.863518450228971e-05, 2.8635545312123405e-05, 2.8636546276211134e-05, 2.8641307664497468e-05, 2.8653576524463726e-05, 2.8658364184140756e-05, 2.869166594360087e-05], 0, 7.322540044784546, 1576749541.0013156], "v": 0.1}
 9 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 256, 56, 56], "float32"], ["TENSOR", [128, 256, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 256, 56, 56, "float32"], [128, 256, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 24028828, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 7, 8, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}], "r": [[4.2865310759148005e-05, 4.286941379938103e-05, 4.288143327871837e-05, 4.2890561441835064e-05, 4.2933770617149097e-05, 4.2947521390861096e-05, 4.2950151829601314e-05, 4.2972405060986713e-05], 0, 11.101155042648315, 1576751595.0987728], "v": 0.1}
10 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 128, 56, 56], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 128, 56, 56, "float32"], [128, 128, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {"i": 29683979, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 16, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[7.771064902149502e-05, 7.771621302534488e-05, 7.776372730189285e-05, 7.77709114533205e-05, 7.778979563683029e-05, 7.782302855309593e-05, 7.782835964068014e-05, 7.802061131292163e-05], 0, 14.508209705352783, 1576753500.4498262], "v": 0.1}
11 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [128, 512, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 512, 28, 28, "float32"], [128, 512, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 4926988, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[3.884389500860585e-05, 3.884483534136546e-05, 3.884495008605852e-05, 3.884514859437751e-05, 3.884752074966533e-05, 3.884931095811818e-05, 3.885193134442532e-05, 3.886651558615414e-05], 0, 3.8512141704559326, 1576755394.4160378], "v": 0.1}
12 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 128, 28, 28, "float32"], [128, 128, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 435080, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 8, 4]], ["tile_x", "sp", [-1, 2, 14, 1]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 1]]}], "r": [[4.1679057039462345e-05, 4.169603480958125e-05, 4.173156126141651e-05, 4.174099241771497e-05, 4.17537394451146e-05, 4.176737911425125e-05, 4.180427778735137e-05, 4.1842181802515944e-05], 0, 17.665316343307495, 1576756372.9378417], "v": 0.1}
13 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 128, 28, 28], "float32"], ["TENSOR", [512, 128, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 128, 28, 28, "float32"], [512, 128, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 7195570, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 4]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[2.3346970659581842e-05, 2.3350071733856995e-05, 2.3351470832852026e-05, 2.33531713064572e-05, 2.335339979207578e-05, 2.3353435023680258e-05, 2.3353469100150168e-05, 2.352672346078318e-05], 0, 3.9529354572296143, 1576758538.3815165], "v": 0.1}
14 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 512, 28, 28], "float32"], ["TENSOR", [256, 512, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 512, 28, 28, "float32"], [256, 512, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 6510597, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[5.3532303771890436e-05, 5.3536134934889986e-05, 5.3546005612932196e-05, 5.35473542882802e-05, 5.3553208576560395e-05, 5.3553290974405026e-05, 5.356555365963179e-05, 5.3567800853165695e-05], 0, 20.82706069946289, 1576760260.5486808], "v": 0.1}
15 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 256, 28, 28], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 256, 28, 28, "float32"], [256, 256, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {"i": 8942372, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00011063845656192236, 0.00011064159057301295, 0.00011065819362292052, 0.00011066027033271719, 0.0001106855050831793, 0.00011076385027726432, 0.00011078445009242144, 0.00011081034334565619], 0, 9.816993236541748, 1576764007.1941638], "v": 0.1}
16 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [256, 1024, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 1024, 14, 14, "float32"], [256, 1024, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 198685, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}], "r": [[5.348998772595403e-05, 5.3490697165811203e-05, 5.3492447444766794e-05, 5.349257978129882e-05, 5.3495581343450127e-05, 5.349945347020754e-05, 5.350027560812319e-05, 5.350121758536041e-05], 0, 12.985000848770142, 1576765852.0497272], "v": 0.1}
17 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 256, 14, 14, "float32"], [256, 256, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 37032, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 16, 2]], ["tile_x", "sp", [-1, 7, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[4.815374575928918e-05, 4.8269461833602585e-05, 4.8282748788368335e-05, 4.829056845718901e-05, 4.829301453957997e-05, 4.8313505250403876e-05, 4.83151344911147e-05, 4.836303352180937e-05], 0, 23.413869857788086, 1576767777.9769428], "v": 0.1}
18 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 256, 14, 14], "float32"], ["TENSOR", [1024, 256, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 256, 14, 14, "float32"], [1024, 256, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 2967855, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[3.166483473053892e-05, 3.1665308183632737e-05, 3.1667968063872256e-05, 3.166937365269461e-05, 3.1671619294743846e-05, 3.167257045908184e-05, 3.167355941450432e-05, 3.167667092481703e-05], 0, 10.932707786560059, 1576768799.8192716], "v": 0.1}
19 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 1024, 14, 14], "float32"], ["TENSOR", [512, 1024, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 1024, 14, 14, "float32"], [512, 1024, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 241809, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 8, 1]], ["tile_y", "sp", [-1, 1, 7, 2]], ["tile_x", "sp", [-1, 1, 2, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}], "r": [[7.510923047977422e-05, 7.51122966447162e-05, 7.51243276889307e-05, 7.512493289432423e-05, 7.512781812480401e-05, 7.51462053935403e-05, 7.51566958294136e-05, 7.51760090937598e-05], 0, 7.172631025314331, 1576770471.289473], "v": 0.1}
20 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 512, 14, 14], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 512, 14, 14, "float32"], [512, 512, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {"i": 679873, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 4, 2]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 1, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.00020182939416666667, 0.0002018556, 0.00020186024833333336, 0.00020188062583333333, 0.00020188473333333334, 0.00020189181, 0.00020189958083333335, 0.00020191412583333334], 0, 19.01798701286316, 1576772277.906625], "v": 0.1}
21 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 2048, 7, 7], "float32"], ["TENSOR", [512, 2048, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 2048, 7, 7, "float32"], [512, 2048, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 14540, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 1, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}], "r": [[0.00011496370787589499, 0.00011497747016706444, 0.00011497936276849642, 0.0001149833875894988, 0.00011500211169451074, 0.00011501386825775656, 0.00011501737088305489, 0.00011502488019093078], 0, 12.199096202850342, 1576774255.8655233], "v": 0.1}
22 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 512, 7, 7, "float32"], [512, 512, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 190206, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 4]], ["tile_x", "sp", [-1, 1, 8, 2]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[8.524336980999296e-05, 8.525168859957776e-05, 8.532244299788881e-05, 8.538662983814214e-05, 8.542202048947068e-05, 8.544142857142857e-05, 8.544255093910074e-05, 8.565466306203757e-05], 0, 25.467085361480713, 1576776258.6715136], "v": 0.1}
23 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 512, 7, 7], "float32"], ["TENSOR", [2048, 512, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 512, 7, 7, "float32"], [2048, 512, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 318543, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[6.243043341968912e-05, 6.243638264248704e-05, 6.243913419689119e-05, 6.24425738341969e-05, 6.244570233160622e-05, 6.244827487046632e-05, 6.245158186528497e-05, 6.245367227979274e-05], 0, 7.487981796264648, 1576778781.189002], "v": 0.1}
24 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [1, 2048], "float32"], ["TENSOR", [1000, 2048], "float32"], null, "float32"], {}, ["dense", [1, 2048, "float32"], [1000, 2048, "float32"], 0, "float32"], {"i": 9, "t": "direct", "c": null, "e": [["tile_k", "sp", [-1, 512]]]}], "r": [[3.6132621990004544e-05, 3.613589429047402e-05, 3.613702059669847e-05, 3.615986172951689e-05, 3.61632292897168e-05, 3.616852339845525e-05, 3.617775026503105e-05, 3.621560305921551e-05], 0, 3.9012484550476074, 1576779588.3908424], "v": 0.1}
25 | 


--------------------------------------------------------------------------------
/mytophub/20191227/resnet50v1_bs_8.log:
--------------------------------------------------------------------------------
 1 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 1024, 14, 14], "float32"], ["TENSOR", [2048, 1024, 1, 1], "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 1024, 14, 14, "float32"], [2048, 1024, 1, 1, "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {"i": 330332, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 32, 4]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 2]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00047160208414872797, 0.00047163135812133074, 0.00047164415851272016, 0.00047164536790606653, 0.0004716625518590998, 0.00047166374951076323, 0.0004717278786692759, 0.0004717632602739726], 0, 26.869380712509155, 1576808413.8443494], "v": 0.1}
 2 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 512, 28, 28], "float32"], ["TENSOR", [1024, 512, 1, 1], "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 512, 28, 28, "float32"], [1024, 512, 1, 1, "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {"i": 3919726, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0004231376433566434, 0.00042318330944055943, 0.0004232272237762238, 0.0004232554265734266, 0.000423284763986014, 0.0004232878041958042, 0.00042329359615384617, 0.00042339172027972027], 0, 18.58449935913086, 1576811312.0699723], "v": 0.1}
 3 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 256, 56, 56], "float32"], ["TENSOR", [512, 256, 1, 1], "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 256, 56, 56, "float32"], [512, 256, 1, 1, "float32"], [2, 2], [0, 0], [1, 1], "NCHW", "float32"], {"i": 13474247, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 16, 4]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 7, 4, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.0003552764506627393, 0.0003552791192930781, 0.00035538548306332843, 0.00035539221060382914, 0.0003553961575846834, 0.0003554403843888071, 0.00035549844182621504, 0.000355517793814433], 0, 11.16990327835083, 1576813425.8043563], "v": 0.1}
 4 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 3, 224, 224], "float32"], ["TENSOR", [64, 3, 7, 7], "float32"], [2, 2], [3, 3], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 3, 224, 224, "float32"], [64, 3, 7, 7, "float32"], [2, 2], [3, 3], [1, 1], "NCHW", "float32"], {"i": 61634221, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 7, 4, 2]], ["tile_rc", "sp", [-1, 3]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 7]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.0002864761902777778, 0.00028651795833333334, 0.0002865350194444444, 0.0002865372513888889, 0.0002865851402777778, 0.0002866628819444444, 0.0002866646388888889, 0.00028808960555555557], 0, 4.185834646224976, 1576815991.3093324], "v": 0.1}
 5 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 64, 56, 56, "float32"], [64, 64, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 16820894, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 4]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 7, 8, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[5.9864956131605186e-05, 5.986604611166501e-05, 5.989440254237288e-05, 5.989909845463609e-05, 5.990691151545364e-05, 5.990998030907278e-05, 5.994871011964108e-05, 5.9950582253240285e-05], 0, 8.11007571220398, 1576817818.6117089], "v": 0.1}
 6 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 256, 56, 56], "float32"], ["TENSOR", [64, 256, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 256, 56, 56, "float32"], [64, 256, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 21121677, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 8, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 7, 8, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.00015421009636247608, 0.00015422605169112955, 0.00015426547351627315, 0.00015426834396936823, 0.00015427069049138483, 0.0001542987377153797, 0.00015430920038289725, 0.00015430931014677728], 0, 14.920397758483887, 1576819829.1106915], "v": 0.1}
 7 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 64, 56, 56], "float32"], ["TENSOR", [64, 64, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 64, 56, 56, "float32"], [64, 64, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 1511821, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 8, 8]], ["tile_x", "sp", [-1, 2, 16, 1]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 128], ["unroll_explicit", "ot", 1]]}], "r": [[0.0003611332712369598, 0.00036153928192771085, 0.00036155675662650604, 0.00036156732048192776, 0.0003616308361445783, 0.00036165913734939755, 0.00036173152289156626, 0.00036173332289156623], 0, 26.35872220993042, 1576822608.4182003], "v": 0.1}
 8 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 64, 56, 56], "float32"], ["TENSOR", [256, 64, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 64, 56, 56, "float32"], [256, 64, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 42614582, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 2, 28, 1]], ["tile_rc", "sp", [-1, 32]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00017792937223042835, 0.00017795839364844905, 0.00017796971935007386, 0.00017798387740029542, 0.0001779944261447563, 0.00017801247341211226, 0.00017801342023633678, 0.00017802488921713443], 0, 12.574925899505615, 1576826236.5003185], "v": 0.1}
 9 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 256, 56, 56], "float32"], ["TENSOR", [128, 256, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 256, 56, 56, "float32"], [128, 256, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 9437784, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 8, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 7, 8, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}], "r": [[0.0002548315581668625, 0.00025490869682726205, 0.0002549153266745006, 0.0002549291809635723, 0.00025493499059929493, 0.00025494996474735603, 0.00025498944770857817, 0.0002550550564042303], 0, 3.7231128215789795, 1576827731.1170166], "v": 0.1}
10 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 128, 56, 56], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 128, 56, 56, "float32"], [128, 128, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {"i": 17141702, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 32, 2]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 7, 4, 1]], ["tile_rc", "sp", [-1, 2]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.00033271655875831484, 0.0003327339090909091, 0.000332819467849224, 0.00033283552106430154, 0.00033284973614190684, 0.00033287114190687367, 0.0003328896629711752, 0.0003329257804878049], 0, 10.087277889251709, 1576831663.058365], "v": 0.1}
11 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 512, 28, 28], "float32"], ["TENSOR", [128, 512, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 512, 28, 28, "float32"], [128, 512, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 4470080, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 4]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 7, 4, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.00012369274896142433, 0.00012372559584569734, 0.00012373454836795252, 0.00012374497566765578, 0.00012377120415430265, 0.00012379367477744805, 0.00012381379228486647, 0.00012388769020771513], 0, 3.823491334915161, 1576833619.866541], "v": 0.1}
12 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 128, 28, 28], "float32"], ["TENSOR", [128, 128, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 128, 28, 28, "float32"], [128, 128, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 275960, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 8, 4]], ["tile_x", "sp", [-1, 4, 8, 1]], ["tile_rc", "sp", [-1, 16]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}], "r": [[0.0002838563733804476, 0.0002839569387514723, 0.0002839840800942285, 0.0002840338138987044, 0.000284164679623086, 0.00028439453121319197, 0.000284434667844523, 0.00028472418492343933], 0, 24.73231339454651, 1576835294.7347863], "v": 0.1}
13 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 128, 28, 28], "float32"], ["TENSOR", [512, 128, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 128, 28, 28, "float32"], [512, 128, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 3971154, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 8]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 7, 4, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}], "r": [[0.00013289392889173607, 0.00013291090134529148, 0.00013292713004484305, 0.000132931466367713, 0.000132932600256246, 0.00013293274503523383, 0.0001330451800128123, 0.00013305504420243433], 0, 3.9201998710632324, 1576836702.1013336], "v": 0.1}
14 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 512, 28, 28], "float32"], ["TENSOR", [256, 512, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 512, 28, 28, "float32"], [256, 512, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 3506350, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 4]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 7, 4, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}], "r": [[0.00023379110125361618, 0.00023381751880424302, 0.00023383771456123432, 0.00023387868273866926, 0.00023388072613307617, 0.00023388729893924782, 0.00023388846769527484, 0.00023398381774349084], 0, 19.840566635131836, 1576837962.458536], "v": 0.1}
15 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 256, 28, 28], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 256, 28, 28, "float32"], [256, 256, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {"i": 7270269, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 64, 1]], ["tile_y", "sp", [-1, 1, 2, 7]], ["tile_x", "sp", [-1, 2, 1, 1]], ["tile_rc", "sp", [-1, 2]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.0003585990673652695, 0.00035861623502994014, 0.0003586432724550898, 0.0003586568997005988, 0.00035865918413173655, 0.0003586718188622755, 0.000358678747005988, 0.0003586881362275449], 0, 20.626556396484375, 1576839684.304072], "v": 0.1}
16 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 1024, 14, 14], "float32"], ["TENSOR", [256, 1024, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 1024, 14, 14, "float32"], [256, 1024, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 2463856, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 32, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 7, 2, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0001897665987460815, 0.0001897788299373041, 0.00018978015125391848, 0.000189795079153605, 0.0001898249984326019, 0.00018982909247648903, 0.00018983846394984326, 0.00018984618338557994], 0, 6.394846200942993, 1576841458.9137883], "v": 0.1}
17 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 256, 14, 14], "float32"], ["TENSOR", [256, 256, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 256, 14, 14, "float32"], [256, 256, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 1590040, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 8, 4]], ["tile_x", "sp", [-1, 7, 8, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00020388628940217393, 0.0002039372105978261, 0.00020394932608695654, 0.00020401635869565215, 0.00020405609510869564, 0.00020423308831521737, 0.0002043133179347826, 0.0002044258722826087], 0, 3.752495527267456, 1576842857.580174], "v": 0.1}
18 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 256, 14, 14], "float32"], ["TENSOR", [1024, 256, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 256, 14, 14, "float32"], [1024, 256, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 3551643, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 8, 2]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0001493197428924598, 0.00014934773980222497, 0.00014935392459826947, 0.00014935883189122373, 0.00014935919097651423, 0.0001493756347342398, 0.00014938754264524104, 0.00014940507540173054], 0, 7.563421964645386, 1576844478.294257], "v": 0.1}
19 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 1024, 14, 14], "float32"], ["TENSOR", [512, 1024, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 1024, 14, 14, "float32"], [512, 1024, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 3228850, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 32, 4]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 7, 2, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0003698850180505415, 0.00036991004873646206, 0.0003699644657039711, 0.0003699842310469314, 0.0003699897545126354, 0.0003699923375451264, 0.00037000330866425994, 0.00037061573285198556], 0, 3.743291139602661, 1576846193.779363], "v": 0.1}
20 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 512, 14, 14], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 512, 14, 14, "float32"], [512, 512, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {"i": 814525, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 7, 1, 1]], ["tile_rc", "sp", [-1, 2]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0006693071712707183, 0.0006693115580110497, 0.0006693274088397789, 0.0006694159696132596, 0.0006694531270718232, 0.0006695107458563536, 0.0006695622099447513, 0.0006696967624309392], 0, 19.60824728012085, 1576849501.9256742], "v": 0.1}
21 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 2048, 7, 7], "float32"], ["TENSOR", [512, 2048, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 2048, 7, 7, "float32"], [512, 2048, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 226636, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 7, 1, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00026882018638392855, 0.00026882994419642856, 0.0002688526339285714, 0.00026886569754464284, 0.0002688759375, 0.00026889643861607144, 0.0002688987098214286, 0.0002689007779017857], 0, 10.092194557189941, 1576851044.2091682], "v": 0.1}
22 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 512, 7, 7], "float32"], ["TENSOR", [512, 512, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 512, 7, 7, "float32"], [512, 512, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "NCHW", "float32"], {"i": 1404638, "t": "winograd", "c": null, "e": [["tile_b", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 16, 8]], ["tile_x", "sp", [-1, 8, 8, 1]], ["tile_rc", "sp", [-1, 8]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00019711420424836602, 0.00019726046732026145, 0.00019727502369281045, 0.00019740761683006535, 0.00019772490114379086, 0.0001977619297385621, 0.00019793598120915033, 0.0001979388316993464], 0, 19.893366813659668, 1576852256.3964272], "v": 0.1}
23 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 512, 7, 7], "float32"], ["TENSOR", [2048, 512, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 512, 7, 7, "float32"], [2048, 512, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 318544, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00021115921561668148, 0.00021138156255545698, 0.00021152970807453416, 0.00021153112511091392, 0.00021155988819875776, 0.00021162797604259095, 0.00021166313043478259, 0.0002117495039929015], 0, 7.84623908996582, 1576853859.8838825], "v": 0.1}
24 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [8, 2048], "float32"], ["TENSOR", [1000, 2048], "float32"], null, "float32"], {}, ["dense", [8, 2048, "float32"], [1000, 2048, "float32"], 0, "float32"], {"i": 9, "t": "direct", "c": null, "e": [["tile_k", "sp", [-1, 512]]]}], "r": [[0.0002670240141843972, 0.00026794887234042554, 0.0002682500390070922, 0.0002682610035460993, 0.0002683441666666667, 0.00026834742198581565, 0.00026836355319148934, 0.00026842092730496454], 0, 2.7772510051727295, 1576854668.3834345], "v": 0.1}
25 | 


--------------------------------------------------------------------------------
/mytophub/20191227/mobilenet_v2_bs_8.log:
--------------------------------------------------------------------------------
 1 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [8, 576, 14, 14], "float32"], ["TENSOR", [576, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [8, 576, 14, 14, "float32"], [576, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 1228920, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[3.5669175673663835e-05, 3.567513056423999e-05, 3.567604213190412e-05, 3.5687416555009676e-05, 3.569185573916927e-05, 3.5696842786958465e-05, 3.569754652374572e-05, 3.5718478785171956e-05], 0, 6.818683624267578, 1577025186.3151221], "v": 0.1}
 2 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [8, 192, 28, 28], "float32"], ["TENSOR", [192, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [8, 192, 28, 28, "float32"], [192, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 2928254, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 2, 1]], ["tile_y", "sp", [-1, 1, 4, 7]], ["tile_x", "sp", [-1, 1, 28, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[4.3285783862194015e-05, 4.3345013780598366e-05, 4.339025294650952e-05, 4.339941432456935e-05, 4.3416304805077065e-05, 4.349966727107888e-05, 4.352685947416137e-05, 4.3569819038984586e-05], 0, 15.84661078453064, 1577027391.440542], "v": 0.1}
 3 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [8, 144, 56, 56], "float32"], ["TENSOR", [144, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [8, 144, 56, 56, "float32"], [144, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 9794400, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 7]], ["tile_x", "sp", [-1, 1, 56, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}], "r": [[0.0001232650200927357, 0.00012327157702215352, 0.00012327463626996395, 0.00012330674188562596, 0.0001233340432766615, 0.00012337212879958784, 0.00012342744513137556, 0.00012344814064914992], 0, 4.2695276737213135, 1577029008.68025], "v": 0.1}
 4 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 3, 224, 224], "float32"], ["TENSOR", [32, 3, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 3, 224, 224, "float32"], [32, 3, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {"i": 42006765, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 4]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 7, 16, 1]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[8.33800919819507e-05, 8.342193162096494e-05, 8.348304095800069e-05, 8.358770669906282e-05, 8.364438562998958e-05, 8.366169940992711e-05, 8.368628913571676e-05, 8.371644602568552e-05], 0, 27.775304794311523, 1577031705.2260394], "v": 0.1}
 5 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [8, 32, 112, 112], "float32"], ["TENSOR", [32, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [8, 32, 112, 112, "float32"], [32, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 3626840, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 4]], ["tile_x", "sp", [-1, 1, 56, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}], "r": [[0.0001104305337310693, 0.00011043610096374484, 0.00011043797521798991, 0.00011045434694814134, 0.00011045655667737495, 0.00011047186737035336, 0.00011054134511243689, 0.00011090290637907297], 0, 3.5581119060516357, 1577033846.5544987], "v": 0.1}
 6 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 32, 112, 112], "float32"], ["TENSOR", [16, 32, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 32, 112, 112, "float32"], [16, 32, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 9819770, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 4, 4]], ["tile_y", "sp", [-1, 8, 1, 1]], ["tile_x", "sp", [-1, 1, 112, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[7.883216835899116e-05, 7.883803668522764e-05, 7.884616049787094e-05, 7.885013560432362e-05, 7.8856649852604e-05, 7.887155715689485e-05, 7.887355650180151e-05, 7.887720700949885e-05], 0, 30.159452438354492, 1577037320.3374894], "v": 0.1}
 7 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 16, 112, 112], "float32"], ["TENSOR", [96, 16, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 16, 112, 112, "float32"], [96, 16, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 124125292, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 6]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 7, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00018732534856700234, 0.0001873291525948877, 0.00018736358249419057, 0.00018753343919442292, 0.00018761719442292798, 0.00018767686134779242, 0.0001877239194422928, 0.0001877502416731216], 0, 12.485112190246582, 1577040071.49598], "v": 0.1}
 8 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [8, 96, 112, 112], "float32"], ["TENSOR", [96, 1, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [8, 96, 112, 112, "float32"], [96, 1, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "float32"], {"i": 6260800, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 56, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}], "r": [[0.0002090474766435986, 0.0002090854671280277, 0.0002090897560553633, 0.00020909301557093426, 0.0002090958088235294, 0.0002090997759515571, 0.00020913492387543252, 0.0002091731418685121], 0, 17.75760316848755, 1577042127.8527424], "v": 0.1}
 9 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 96, 56, 56], "float32"], ["TENSOR", [24, 96, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 96, 56, 56, "float32"], [24, 96, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 32437752, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 4, 6]], ["tile_y", "sp", [-1, 2, 1, 2]], ["tile_x", "sp", [-1, 2, 28, 1]], ["tile_rc", "sp", [-1, 4]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[5.322861279610361e-05, 5.323237480628736e-05, 5.3252101394731015e-05, 5.325578746955945e-05, 5.3257390967456274e-05, 5.326599535089661e-05, 5.326720301084791e-05, 5.327394819570511e-05], 0, 23.620360851287842, 1577047087.701304], "v": 0.1}
10 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 144, 56, 56], "float32"], ["TENSOR", [24, 144, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 144, 56, 56, "float32"], [24, 144, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 40636097, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 3, 1]], ["tile_y", "sp", [-1, 2, 1, 2]], ["tile_x", "sp", [-1, 1, 56, 1]], ["tile_rc", "sp", [-1, 6]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[7.142262195845698e-05, 7.143360059347181e-05, 7.144461899109792e-05, 7.145777002967359e-05, 7.146389910979229e-05, 7.150970623145401e-05, 7.15296e-05, 7.153969673590505e-05], 0, 21.12481117248535, 1577050269.7861612], "v": 0.1}
11 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 24, 56, 56], "float32"], ["TENSOR", [144, 24, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 24, 56, 56, "float32"], [144, 24, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 47852511, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 2, 3]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 56, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[6.667625798319328e-05, 6.668540084033613e-05, 6.66967112044818e-05, 6.670515630252101e-05, 6.670611120448179e-05, 6.670913081232492e-05, 6.673613417366947e-05, 6.674050588235294e-05], 0, 6.433590650558472, 1577052848.5041995], "v": 0.1}
12 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [8, 144, 56, 56], "float32"], ["TENSOR", [144, 1, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [8, 144, 56, 56, "float32"], [144, 1, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "float32"], {"i": 2478700, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 4, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}], "r": [[8.157018246205734e-05, 8.161573254637437e-05, 8.161759527824621e-05, 8.165797639123103e-05, 8.17569784148398e-05, 8.180711838111299e-05, 8.183942833052277e-05, 8.18401976391231e-05], 0, 40.31033253669739, 1577054424.1942546], "v": 0.1}
13 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 144, 28, 28], "float32"], ["TENSOR", [32, 144, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 144, 28, 28, "float32"], [32, 144, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 3443905, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[2.454023656908064e-05, 2.4576680225988702e-05, 2.4599707447354905e-05, 2.4608112891628146e-05, 2.4628948022598868e-05, 2.463908145865434e-05, 2.466981068310221e-05, 2.47066614278377e-05], 0, 4.257974147796631, 1577056282.017053], "v": 0.1}
14 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 192, 28, 28], "float32"], ["TENSOR", [32, 192, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 192, 28, 28, "float32"], [32, 192, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 5594301, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 4, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 12]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[3.142844767210505e-05, 3.146623119777159e-05, 3.148899960206924e-05, 3.149302029446876e-05, 3.152974240615466e-05, 3.153818861918026e-05, 3.1554146173232525e-05, 3.1634306671972406e-05], 0, 24.322300910949707, 1577058193.2497644], "v": 0.1}
15 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 32, 28, 28], "float32"], ["TENSOR", [192, 32, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 32, 28, 28, "float32"], [192, 32, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 15287493, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 3]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[2.5667207573482084e-05, 2.5780096009440036e-05, 2.5786979618107702e-05, 2.5799740506329115e-05, 2.5808542909246945e-05, 2.5808670349710363e-05, 2.5820789101051276e-05, 2.5872650504183653e-05], 0, 21.165749073028564, 1577059807.9907591], "v": 0.1}
16 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [8, 192, 28, 28], "float32"], ["TENSOR", [192, 1, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [8, 192, 28, 28, "float32"], [192, 1, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "float32"], {"i": 476868, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 2]], ["tile_y", "sp", [-1, 1, 7, 2]], ["tile_x", "sp", [-1, 1, 14, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[3.1763851004759385e-05, 3.1770004494976205e-05, 3.1777092675832893e-05, 3.178300066102591e-05, 3.17909276837652e-05, 3.1797302088841884e-05, 3.180947435219461e-05, 3.181597501322052e-05], 0, 12.726351022720337, 1577061551.003724], "v": 0.1}
17 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 192, 14, 14], "float32"], ["TENSOR", [64, 192, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 192, 14, 14, "float32"], [64, 192, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1692476, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[1.523153599740091e-05, 1.523222391163093e-05, 1.523312001299545e-05, 1.5233745419103314e-05, 1.5234619168291098e-05, 1.523627257959714e-05, 1.5236481611435997e-05, 1.5237268291098115e-05], 0, 24.46720051765442, 1577063015.4099426], "v": 0.1}
18 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 384, 14, 14], "float32"], ["TENSOR", [64, 384, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 384, 14, 14, "float32"], [64, 384, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1907516, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 2]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[2.7104173504079783e-05, 2.7104218041704443e-05, 2.7104731754306438e-05, 2.710505575702629e-05, 2.7106652538531277e-05, 2.7108722461468724e-05, 2.711186536718042e-05, 2.711198470081596e-05], 0, 24.56200408935547, 1577064708.7898057], "v": 0.1}
19 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 64, 14, 14], "float32"], ["TENSOR", [384, 64, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 64, 14, 14, "float32"], [384, 64, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 4733323, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 4, 1]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[1.9784056151313012e-05, 1.9785875453854596e-05, 1.9787038588195558e-05, 1.9787638520645105e-05, 1.9790801654986068e-05, 1.9790973739761883e-05, 1.9792394916828507e-05, 1.979556404627206e-05], 0, 18.558823823928833, 1577066491.66608], "v": 0.1}
20 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [8, 384, 14, 14], "float32"], ["TENSOR", [384, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [8, 384, 14, 14, "float32"], [384, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 559696, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 2, 1]], ["tile_y", "sp", [-1, 1, 2, 7]], ["tile_x", "sp", [-1, 1, 14, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}], "r": [[2.4591763803049033e-05, 2.4593488051091884e-05, 2.4593998351874742e-05, 2.4606124845488258e-05, 2.4607122785331683e-05, 2.4675283168520807e-05, 2.4724753090234856e-05, 2.4803031726411208e-05], 0, 19.9319806098938, 1577068190.5743427], "v": 0.1}
21 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 384, 14, 14], "float32"], ["TENSOR", [96, 384, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 384, 14, 14, "float32"], [96, 384, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 4154797, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 6, 8, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 2, 7, 1]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[3.461854668630339e-05, 3.463162179675994e-05, 3.463527525773196e-05, 3.4637909867452134e-05, 3.470426288659794e-05, 3.470739558173785e-05, 3.470866318114875e-05, 3.472387982326951e-05], 0, 6.891187429428101, 1577069912.342446], "v": 0.1}
22 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 576, 14, 14], "float32"], ["TENSOR", [96, 576, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 576, 14, 14, "float32"], [96, 576, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 5430701, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 6, 8, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[4.9222830204422194e-05, 4.9231537546933666e-05, 4.9259656862745095e-05, 4.927378639966625e-05, 4.9287508969545266e-05, 4.929806925323321e-05, 4.938426387150605e-05, 4.93953310387985e-05], 0, 6.5696797370910645, 1577071960.985715], "v": 0.1}
23 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 96, 14, 14], "float32"], ["TENSOR", [576, 96, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 96, 14, 14, "float32"], [576, 96, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 3767896, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 4, 6]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}], "r": [[4.0616557446062094e-05, 4.0650350815646375e-05, 4.068101473425715e-05, 4.06991383967725e-05, 4.075791492720575e-05, 4.0799126819856164e-05, 4.0825016137519736e-05, 4.0832094544816696e-05], 0, 6.116156578063965, 1577074245.0034497], "v": 0.1}
24 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [8, 576, 14, 14], "float32"], ["TENSOR", [576, 1, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [8, 576, 14, 14, "float32"], [576, 1, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "float32"], {"i": 76440, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[2.6108463751438433e-05, 2.6112443843498274e-05, 2.6118005868814728e-05, 2.612111979286536e-05, 2.6160622094361336e-05, 2.6226580667433833e-05, 2.6232252589182967e-05, 2.6280418757192173e-05], 0, 6.775911569595337, 1577076018.7019913], "v": 0.1}
25 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 576, 7, 7], "float32"], ["TENSOR", [160, 576, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 576, 7, 7, "float32"], [160, 576, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 414415, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 5, 2, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[4.3834081568771866e-05, 4.3842779598600626e-05, 4.3861865402320015e-05, 4.386188050082857e-05, 4.386282470999816e-05, 4.386499907935923e-05, 4.38699972380777e-05, 4.389224212852145e-05], 0, 18.242265224456787, 1577077095.7741208], "v": 0.1}
26 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 960, 7, 7], "float32"], ["TENSOR", [160, 960, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 960, 7, 7, "float32"], [160, 960, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 536321, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 1, 2]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 15]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[6.4000689858175e-05, 6.400274391222906e-05, 6.400405485683704e-05, 6.401103559004549e-05, 6.40133786459727e-05, 6.402129007225047e-05, 6.402430532512712e-05, 6.402782793684773e-05], 0, 19.17388677597046, 1577079077.3417273], "v": 0.1}
27 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 160, 7, 7], "float32"], ["TENSOR", [960, 160, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 160, 7, 7, "float32"], [960, 160, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1434208, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[3.48041003798948e-05, 3.480448056691993e-05, 3.480857305669199e-05, 3.480859614260666e-05, 3.480969783752192e-05, 3.481210169491526e-05, 3.481237156633548e-05, 3.481531516656926e-05], 0, 31.684184789657593, 1577080901.030168], "v": 0.1}
28 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [8, 960, 7, 7], "float32"], ["TENSOR", [960, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [8, 960, 7, 7, "float32"], [960, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 99897, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 3]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}], "r": [[1.3741140247841663e-05, 1.3741577494567452e-05, 1.374161020731779e-05, 1.374192453162624e-05, 1.3742197098725553e-05, 1.3742983379338697e-05, 1.3743329183062196e-05, 1.3743398543489752e-05], 0, 28.37011408805847, 1577082597.7219343], "v": 0.1}
29 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 960, 7, 7], "float32"], ["TENSOR", [320, 960, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 960, 7, 7, "float32"], [320, 960, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 804851, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 15]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00010123226318009279, 0.00010125099493884437, 0.00010125151834668916, 0.00010125188443694643, 0.00010127882918599748, 0.00010128709405314213, 0.00010129502404048924, 0.00010131703247574863], 0, 40.078184843063354, 1577085516.2485526], "v": 0.1}
30 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [8, 320, 7, 7], "float32"], ["TENSOR", [1280, 320, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [8, 320, 7, 7, "float32"], [1280, 320, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 798241, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 16, 4]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 10]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[7.538858774474257e-05, 7.53982291515591e-05, 7.539948042059464e-05, 7.541244778825236e-05, 7.541432596084118e-05, 7.541997897026831e-05, 7.543684300217549e-05, 7.544389738941262e-05], 0, 4.084026575088501, 1577087984.0792356], "v": 0.1}
31 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [8, 1280], "float32"], ["TENSOR", [1000, 1280], "float32"], null, "float32"], {}, ["dense", [8, 1280, "float32"], [1000, 1280, "float32"], 0, "float32"], {"i": 15, "t": "direct", "c": null, "e": [["tile_k", "sp", [-1, 320]]]}], "r": [[0.00016843751947148816, 0.00016859290820584144, 0.00016859392350486788, 0.0001685963602225313, 0.0001686155542420028, 0.00016863228442280946, 0.00016863876008344923, 0.00016869267454798332], 0, 6.655681848526001, 1577089208.2909489], "v": 0.1}
32 | 


--------------------------------------------------------------------------------
/mytophub/20191227/mobilenet_v2_bs_1.log:
--------------------------------------------------------------------------------
 1 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [1, 576, 14, 14], "float32"], ["TENSOR", [576, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [1, 576, 14, 14, "float32"], [576, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 1186920, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[4.256162015643442e-06, 4.2563126487827325e-06, 4.256411190461901e-06, 4.2567419032187085e-06, 4.2567880933805435e-06, 4.256937226189762e-06, 4.257065054311949e-06, 4.2571571145652045e-06], 0, 29.913546562194824, 1577083282.9573474], "v": 0.1}
 2 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [192, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [1, 192, 28, 28, "float32"], [192, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 2924880, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 2, 7, 2]], ["tile_x", "sp", [-1, 1, 28, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[4.766426336721167e-06, 4.767010649580203e-06, 4.768153557224922e-06, 4.768980357931949e-06, 4.7780454264251e-06, 4.7780648475475036e-06, 4.778221718957137e-06, 4.778257909854176e-06], 0, 5.5999884605407715, 1577084138.5747097], "v": 0.1}
 3 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [1, 144, 56, 56], "float32"], ["TENSOR", [144, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [1, 144, 56, 56, "float32"], [144, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 12007450, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 1, 8]], ["tile_x", "sp", [-1, 2, 28, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[1.3898894137096298e-05, 1.38993009387731e-05, 1.389971016118557e-05, 1.3901035366357678e-05, 1.3901307787683769e-05, 1.3901483970006496e-05, 1.3902622955659207e-05, 1.3903008561138335e-05], 0, 33.551636695861816, 1577085863.0345652], "v": 0.1}
 4 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 3, 224, 224], "float32"], ["TENSOR", [32, 3, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 3, 224, 224, "float32"], [32, 3, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {"i": 42004105, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 16, 1]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[8.649715312597063e-06, 8.649995207880374e-06, 8.650231042286018e-06, 8.650303412166658e-06, 8.650794648799752e-06, 8.65089208856547e-06, 8.650967830678439e-06, 8.651983626924612e-06], 0, 7.5378313064575195, 1577088139.3531494], "v": 0.1}
 5 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [1, 32, 112, 112], "float32"], ["TENSOR", [32, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [1, 32, 112, 112, "float32"], [32, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 5830328, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 2, 56, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[7.183194829237152e-06, 7.1835263006702835e-06, 7.183646632620491e-06, 7.183808266836898e-06, 7.184015544206831e-06, 7.184131790616023e-06, 7.184360293648261e-06, 7.185253814235557e-06], 0, 6.09515643119812, 1577089519.9242003], "v": 0.1}
 6 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 32, 112, 112], "float32"], ["TENSOR", [16, 32, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 32, 112, 112, "float32"], [16, 32, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 18703358, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 4, 2]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[5.841753497942387e-06, 5.844725778953556e-06, 5.84535681951793e-06, 5.8472145502645506e-06, 5.848550235155791e-06, 5.8599458847736625e-06, 5.879950323339213e-06, 5.883456643151088e-06], 0, 3.213482618331909, 1577091945.6474025], "v": 0.1}
 7 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [1, 96, 112, 112], "float32"], ["TENSOR", [96, 1, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [1, 96, 112, 112, "float32"], [96, 1, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "float32"], {"i": 6254976, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 8, 1, 1]], ["tile_x", "sp", [-1, 1, 56, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}], "r": [[2.895801455427532e-05, 2.8958062704669497e-05, 2.8958580715585204e-05, 2.8958952819890844e-05, 2.8959407155852035e-05, 2.896004499696786e-05, 2.8960418071558523e-05, 2.8960928320194055e-05], 0, 17.70728850364685, 1577095402.1052015], "v": 0.1}
 8 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 96, 56, 56], "float32"], ["TENSOR", [24, 96, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 96, 56, 56, "float32"], [24, 96, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 34401177, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 3]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 2, 4, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[7.162611532792926e-06, 7.163162196020634e-06, 7.163197089167281e-06, 7.163887214443626e-06, 7.164199926308032e-06, 7.164350921149594e-06, 7.164514627855563e-06, 7.178308843036109e-06], 0, 3.538969039916992, 1577098448.371999], "v": 0.1}
 9 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 144, 56, 56], "float32"], ["TENSOR", [24, 144, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 144, 56, 56, "float32"], [24, 144, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 19597577, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 3]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 8, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[9.574150888610764e-06, 9.574410463078848e-06, 9.574568911138924e-06, 9.575588035043805e-06, 9.5759211514393e-06, 9.577087058823529e-06, 9.580039499374217e-06, 9.580816821026283e-06], 0, 3.36997127532959, 1577100212.032417], "v": 0.1}
10 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 24, 56, 56], "float32"], ["TENSOR", [144, 24, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 24, 56, 56, "float32"], [144, 24, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 32172392, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 9, 4, 1]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 56, 1]], ["tile_rc", "sp", [-1, 12]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}], "r": [[7.403441449198569e-06, 7.403902404291959e-06, 7.404091336600874e-06, 7.404116671082263e-06, 7.404120380182806e-06, 7.404631441250497e-06, 7.40475384156842e-06, 7.405115545105312e-06], 0, 11.287461757659912, 1577104116.1376636], "v": 0.1}
11 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [1, 144, 56, 56], "float32"], ["TENSOR", [144, 1, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [1, 144, 56, 56, "float32"], [144, 1, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "float32"], {"i": 3038700, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 4, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[5.2229270524166105e-06, 5.228248985704561e-06, 5.250097699115044e-06, 5.277759700476515e-06, 5.278095112321307e-06, 5.2993250374404354e-06, 5.299374976174268e-06, 5.4285359564329475e-06], 0, 3.1553022861480713, 1577105674.5875518], "v": 0.1}
12 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 144, 28, 28], "float32"], ["TENSOR", [32, 144, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 144, 28, 28, "float32"], [32, 144, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 6688659, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 2]], ["tile_rc", "sp", [-1, 144]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[5.958930087524424e-06, 5.959027542089344e-06, 5.95940464655657e-06, 5.959563475281711e-06, 5.959722678729156e-06, 5.959848344530393e-06, 5.959906399721635e-06, 5.960143732769466e-06], 0, 5.992655277252197, 1577107250.3644614], "v": 0.1}
13 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [32, 192, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 192, 28, 28, "float32"], [32, 192, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 7011627, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 16, 2]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 4, 1]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[7.046878329846038e-06, 7.0474393991443115e-06, 7.048045282783174e-06, 7.049342868742388e-06, 7.050769776084444e-06, 7.0512095499828235e-06, 7.051481465288404e-06, 7.051635926423285e-06], 0, 9.481719732284546, 1577111353.3867362], "v": 0.1}
14 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 32, 28, 28], "float32"], ["TENSOR", [192, 32, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 32, 28, 28, "float32"], [192, 32, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 8292977, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 4, 3]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[4.898193303988262e-06, 4.898248446045085e-06, 4.898303854875284e-06, 4.898452794451113e-06, 4.898921942110178e-06, 4.8990147525676935e-06, 4.918776017073495e-06, 4.925037161531279e-06], 0, 3.0670552253723145, 1577112859.4634447], "v": 0.1}
15 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [1, 192, 28, 28], "float32"], ["TENSOR", [192, 1, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [1, 192, 28, 28, "float32"], [192, 1, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "float32"], {"i": 389760, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}], "r": [[4.0862087758663674e-06, 4.097030491149339e-06, 4.098363525305411e-06, 4.136863924208427e-06, 4.175083071553229e-06, 4.194510022438295e-06, 4.1986394664672155e-06, 4.237582822238843e-06], 0, 2.86110258102417, 1577114034.9081006], "v": 0.1}
16 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 192, 14, 14], "float32"], ["TENSOR", [64, 192, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 192, 14, 14, "float32"], [64, 192, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1430039, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 48]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[6.312398292531847e-06, 6.315889957178638e-06, 6.334382402843985e-06, 6.342390967116426e-06, 6.345885944359161e-06, 6.354949960949072e-06, 6.358513156122916e-06, 6.362491503056745e-06], 0, 11.796204328536987, 1577115071.6172433], "v": 0.1}
17 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 384, 14, 14], "float32"], ["TENSOR", [64, 384, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 384, 14, 14, "float32"], [64, 384, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1946135, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 16, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 48]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[1.0617377347559504e-05, 1.0620711388982766e-05, 1.0622015738907933e-05, 1.062692782310626e-05, 1.062701144208951e-05, 1.0627607879109737e-05, 1.0628751701829769e-05, 1.0630421474436345e-05], 0, 9.924859046936035, 1577116739.1119459], "v": 0.1}
18 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 64, 14, 14], "float32"], ["TENSOR", [384, 64, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 64, 14, 14, "float32"], [384, 64, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 2273973, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 3]], ["tile_y", "sp", [-1, 2, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[6.522165207756232e-06, 6.529184487534626e-06, 6.53598947368421e-06, 6.542384930747923e-06, 6.554154958448754e-06, 6.557132963988919e-06, 6.573822742382271e-06, 6.585809362880887e-06], 0, 9.513134241104126, 1577118703.810596], "v": 0.1}
19 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [1, 384, 14, 14], "float32"], ["TENSOR", [384, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [1, 384, 14, 14, "float32"], [384, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 702736, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 2, 1]], ["tile_y", "sp", [-1, 1, 14, 1]], ["tile_x", "sp", [-1, 1, 7, 2]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[3.632894071552672e-06, 3.6334047441761516e-06, 3.6334306102187713e-06, 3.633492890827217e-06, 3.6337781973548914e-06, 3.6338098783817322e-06, 3.633916941460128e-06, 3.6340649044427897e-06], 0, 10.450222969055176, 1577119442.6631932], "v": 0.1}
20 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 384, 14, 14], "float32"], ["TENSOR", [96, 384, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 384, 14, 14, "float32"], [96, 384, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 5183408, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 12, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 48]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[1.1937419841393252e-05, 1.193864608925517e-05, 1.1938808272430416e-05, 1.1940988285906805e-05, 1.1941515627429637e-05, 1.1942867827709532e-05, 1.1943505882962732e-05, 1.1943663815891773e-05], 0, 6.855751991271973, 1577121202.8205402], "v": 0.1}
21 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 576, 14, 14], "float32"], ["TENSOR", [96, 576, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 576, 14, 14, "float32"], [96, 576, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1914794, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 1]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 36]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}], "r": [[1.815036938931298e-05, 1.815235038167939e-05, 1.8153618702290078e-05, 1.815396404580153e-05, 1.8154230916030535e-05, 1.8159444351145038e-05, 1.8159473282442747e-05, 1.816900786259542e-05], 0, 6.211086988449097, 1577123092.1215918], "v": 0.1}
22 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [1, 576, 14, 14], "float32"], ["TENSOR", [576, 1, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [1, 576, 14, 14, "float32"], [576, 1, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "float32"], {"i": 62160, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}], "r": [[3.9184910079619905e-06, 3.9185321727510625e-06, 3.918747796069115e-06, 3.9187520090569955e-06, 3.918796477422468e-06, 3.9188999461776875e-06, 3.918965163972457e-06, 3.91907824650619e-06], 0, 13.658522367477417, 1577127326.064711], "v": 0.1}
23 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 576, 7, 7], "float32"], ["TENSOR", [160, 576, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 576, 7, 7, "float32"], [160, 576, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 197625, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 40, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 1, 1]], ["tile_rc", "sp", [-1, 48]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[1.5496503255120033e-05, 1.549661311542113e-05, 1.549692092770921e-05, 1.5498575681540757e-05, 1.5498696527871966e-05, 1.5500766648582668e-05, 1.550101274922013e-05, 1.5501034789095347e-05], 0, 9.248530149459839, 1577128684.9711614], "v": 0.1}
24 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [160, 960, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 960, 7, 7, "float32"], [160, 960, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 557369, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 40, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 40]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[2.0863379659790084e-05, 2.086367453854506e-05, 2.086381044154904e-05, 2.086463255519363e-05, 2.0864978736880203e-05, 2.0865093286283027e-05, 2.086545258776692e-05, 2.0865820213536014e-05], 0, 4.844036817550659, 1577130480.5591009], "v": 0.1}
25 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 160, 7, 7], "float32"], ["TENSOR", [960, 160, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 160, 7, 7, "float32"], [960, 160, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 228595, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 8, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 80]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 0]]}], "r": [[1.0966452136400986e-05, 1.0967450133525062e-05, 1.0968967235004108e-05, 1.0970244299506983e-05, 1.0970635630649137e-05, 1.0972328985209532e-05, 1.0974144515201315e-05, 1.09784135168447e-05], 0, 5.936706781387329, 1577131748.0362394], "v": 0.1}
26 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [960, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [1, 960, 7, 7, "float32"], [960, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 121072, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[3.611961681842175e-06, 3.620906868566904e-06, 3.6210494358669835e-06, 3.6363834949854844e-06, 3.653309976247031e-06, 3.6787850191343363e-06, 3.687062054631829e-06, 3.689518111638955e-06], 0, 3.3101305961608887, 1577132890.1084025], "v": 0.1}
27 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 960, 7, 7], "float32"], ["TENSOR", [320, 960, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 960, 7, 7, "float32"], [320, 960, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 836031, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 20, 1]], ["tile_y", "sp", [-1, 1, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 40]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[3.429527770609319e-05, 3.4295376057347666e-05, 3.429598336917563e-05, 3.429667827956989e-05, 3.4298970609318994e-05, 3.429956286738351e-05, 3.4300457347670254e-05, 3.430223168458781e-05], 0, 4.7007317543029785, 1577133963.272713], "v": 0.1}
28 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 320, 7, 7], "float32"], ["TENSOR", [1280, 320, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 320, 7, 7, "float32"], [1280, 320, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 524062, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 8, 2, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 20]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}], "r": [[2.61371262993763e-05, 2.6138839916839915e-05, 2.613949818087318e-05, 2.614049792099792e-05, 2.6143544828482328e-05, 2.6143851091476094e-05, 2.6144472323284822e-05, 2.6145180743243243e-05], 0, 3.301823854446411, 1577135231.2540295], "v": 0.1}
29 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [1, 1280], "float32"], ["TENSOR", [1000, 1280], "float32"], null, "float32"], {}, ["dense", [1, 1280, "float32"], [1000, 1280, "float32"], 0, "float32"], {"i": 12, "t": "direct", "c": null, "e": [["tile_k", "sp", [-1, 128]]]}], "r": [[2.3896592488451496e-05, 2.3912026913034746e-05, 2.3912977605944968e-05, 2.3918613376179953e-05, 2.393022725446877e-05, 2.393378650331392e-05, 2.3941181763406304e-05, 2.3958782888130145e-05], 0, 6.477096796035767, 1577135912.7309265], "v": 0.1}
30 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 16, 112, 112], "float32"], ["TENSOR", [96, 16, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 16, 112, 112, "float32"], [96, 16, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 36287689, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 2, 4]], ["tile_y", "sp", [-1, 1, 4, 1]], ["tile_x", "sp", [-1, 1, 16, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}], "r": [[1.9788722391752575e-05, 1.9789338474226802e-05, 1.9792051216494844e-05, 1.979239917525773e-05, 1.9792576989690723e-05, 1.979290375257732e-05, 1.979402630927835e-05, 1.9794034144329898e-05], 0, 43.663047790527344, 1577037207.4167051], "v": 0.1}
31 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [1, 96, 14, 14], "float32"], ["TENSOR", [576, 96, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [1, 96, 14, 14, "float32"], [576, 96, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 12157414, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 9, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 24]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[8.802540760660776e-06, 8.802668152132154e-06, 8.802742143680369e-06, 8.805153745678064e-06, 8.806079062620054e-06, 8.807195197848637e-06, 8.807517249327699e-06, 8.809732347291586e-06], 0, 6.136052370071411, 1577069179.8141932], "v": 0.1}
32 | 


--------------------------------------------------------------------------------
/mytophub/20191227/mobilenet_v2_bs_128.log:
--------------------------------------------------------------------------------
 1 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [128, 576, 14, 14], "float32"], ["TENSOR", [576, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [128, 576, 14, 14, "float32"], [576, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 541892, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 9, 1]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.0004924322769857434, 0.0004924659327902241, 0.0004924813238289205, 0.0004925114949083503, 0.0004925402790224032, 0.0004925837718940937, 0.0004925917617107943, 0.0004926085050916497], 0, 22.871050357818604, 1576982230.884389], "v": 0.1}
 2 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [128, 192, 28, 28], "float32"], ["TENSOR", [192, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [128, 192, 28, 28, "float32"], [192, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 2390654, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 2, 1]], ["tile_y", "sp", [-1, 1, 4, 7]], ["tile_x", "sp", [-1, 1, 28, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}], "r": [[0.0006410275824468085, 0.000641137789893617, 0.0006411516968085106, 0.0006411983776595745, 0.0006412184893617021, 0.0006412689494680851, 0.000641275545212766, 0.000641281210106383], 0, 17.757728815078735, 1576983101.2966228], "v": 0.1}
 3 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [128, 144, 56, 56], "float32"], ["TENSOR", [144, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [128, 144, 56, 56, "float32"], [144, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 9790302, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 2, 2]], ["tile_y", "sp", [-1, 1, 2, 4]], ["tile_x", "sp", [-1, 1, 56, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}], "r": [[0.0019051543924050634, 0.001905155670886076, 0.001905259253164557, 0.0019052785949367089, 0.0019054091012658226, 0.0019054247974683546, 0.0019054377848101265, 0.0019054836202531645], 0, 5.685882806777954, 1576984814.935286], "v": 0.1}
 4 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 3, 224, 224], "float32"], ["TENSOR", [32, 3, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 3, 224, 224, "float32"], [32, 3, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "NCHW", "float32"], {"i": 50838811, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 4, 8]], ["tile_y", "sp", [-1, 1, 1, 4]], ["tile_x", "sp", [-1, 1, 112, 1]], ["tile_rc", "sp", [-1, 1]], ["tile_ry", "sp", [-1, 3]], ["tile_rx", "sp", [-1, 3]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0011669232512077296, 0.00116694715942029, 0.0011670172995169082, 0.0011670459420289854, 0.0011672542512077295, 0.0011673511304347826, 0.0011673662077294686, 0.0011674710628019323], 0, 8.119793891906738, 1576987464.199659], "v": 0.1}
 5 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [128, 32, 112, 112], "float32"], ["TENSOR", [32, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [128, 32, 112, 112, "float32"], [32, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 3642520, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 4, 4]], ["tile_x", "sp", [-1, 1, 112, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}], "r": [[0.0016790437847222222, 0.0016791174583333332, 0.0016791198333333335, 0.0016791405347222222, 0.0016792294583333333, 0.0016794495902777778, 0.0016795537847222224, 0.0016795567708333334], 0, 14.72003984451294, 1576989505.415866], "v": 0.1}
 6 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 32, 112, 112], "float32"], ["TENSOR", [16, 32, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 32, 112, 112, "float32"], [16, 32, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 15309186, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 4, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 112, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}], "r": [[0.0012192305806451613, 0.0012192714677419355, 0.0012193339838709677, 0.001219483322580645, 0.0012195604274193546, 0.0012196175887096775, 0.001219651879032258, 0.001219747612903226], 0, 10.505413055419922, 1576992400.9788272], "v": 0.1}
 7 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 16, 112, 112], "float32"], ["TENSOR", [96, 16, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 16, 112, 112, "float32"], [96, 16, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 124282142, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 2, 48]], ["tile_y", "sp", [-1, 1, 2, 1]], ["tile_x", "sp", [-1, 2, 56, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00292620685, 0.0029264251250000002, 0.0029265299500000003, 0.002926978525, 0.0029271707375, 0.0029272888375, 0.0029272892750000002, 0.002927462125], 0, 5.112454175949097, 1576995358.973894], "v": 0.1}
 8 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [128, 96, 112, 112], "float32"], ["TENSOR", [96, 1, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [128, 96, 112, 112, "float32"], [96, 1, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "float32"], {"i": 7694400, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 56, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.003257665833333333, 0.0032577514166666666, 0.0032578503125, 0.003258083875, 0.003258299479166667, 0.0032583697083333335, 0.003258842645833333, 0.003259034166666667], 0, 2.9178566932678223, 1576997214.2991278], "v": 0.1}
 9 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 96, 56, 56], "float32"], ["TENSOR", [24, 96, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 96, 56, 56, "float32"], [24, 96, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 32954475, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 3, 8]], ["tile_y", "sp", [-1, 4, 2, 1]], ["tile_x", "sp", [-1, 1, 56, 1]], ["tile_rc", "sp", [-1, 6]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.000775195570967742, 0.000775219829032258, 0.0007752351677419355, 0.0007752462580645161, 0.0007752852806451612, 0.000775311670967742, 0.0007753124516129032, 0.000775324770967742], 0, 3.656017303466797, 1577000856.7665567], "v": 0.1}
10 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 144, 56, 56], "float32"], ["TENSOR", [24, 144, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 144, 56, 56, "float32"], [24, 144, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 17594475, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 3, 8]], ["tile_y", "sp", [-1, 4, 2, 1]], ["tile_x", "sp", [-1, 1, 56, 1]], ["tile_rc", "sp", [-1, 6]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.0010762264454545454, 0.0010763786954545453, 0.0010765197318181817, 0.0010765394318181817, 0.0010765941954545455, 0.0010766144727272727, 0.00107665635, 0.0010767212681818182], 0, 3.779479742050171, 1577003398.419684], "v": 0.1}
11 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 24, 56, 56], "float32"], ["TENSOR", [144, 24, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 24, 56, 56, "float32"], [144, 24, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 45623120, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 3, 8]], ["tile_y", "sp", [-1, 2, 1, 2]], ["tile_x", "sp", [-1, 1, 56, 1]], ["tile_rc", "sp", [-1, 6]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.0010314081643835616, 0.0010314449178082192, 0.0010314843904109588, 0.0010315650136986303, 0.001031610212328767, 0.0010317074657534247, 0.0010317642945205478, 0.001031856794520548], 0, 12.336322784423828, 1577005390.9159007], "v": 0.1}
12 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [128, 144, 56, 56], "float32"], ["TENSOR", [144, 1, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [128, 144, 56, 56, "float32"], [144, 1, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "float32"], {"i": 2598400, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 1, 1]], ["tile_y", "sp", [-1, 1, 7, 2]], ["tile_x", "sp", [-1, 2, 7, 2]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}], "r": [[0.0012250302588832488, 0.0012251331472081218, 0.0012251812030456854, 0.001225217116751269, 0.0012252300862944162, 0.0012252728274111675, 0.0012253132588832488, 0.001225341116751269], 0, 3.809429407119751, 1577007590.3815002], "v": 0.1}
13 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 144, 28, 28], "float32"], ["TENSOR", [32, 144, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 144, 28, 28, "float32"], [32, 144, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 5862504, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 4, 2]], ["tile_y", "sp", [-1, 2, 2, 1]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.00030421541666666665, 0.00030421977146464645, 0.00030422413383838386, 0.00030429518055555553, 0.00030431397474747475, 0.00030433110606060607, 0.0003043926994949495, 0.00030440221212121215], 0, 9.922117233276367, 1577010261.225624], "v": 0.1}
14 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 192, 28, 28], "float32"], ["TENSOR", [32, 192, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 192, 28, 28, "float32"], [32, 192, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1740609, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 8, 1]], ["tile_y", "sp", [-1, 4, 1, 1]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}], "r": [[0.0003867008465266559, 0.0003867304830371567, 0.00038674032310177705, 0.0003867692164781906, 0.00038679729240710827, 0.0003868171841680129, 0.0003868245573505654, 0.0003868939660743134], 0, 6.584372282028198, 1577012153.8546429], "v": 0.1}
15 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 32, 28, 28], "float32"], ["TENSOR", [192, 32, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 32, 28, 28, "float32"], [192, 32, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 8836179, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 3, 8, 1]], ["tile_y", "sp", [-1, 1, 1, 2]], ["tile_x", "sp", [-1, 1, 28, 1]], ["tile_rc", "sp", [-1, 16]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.0003551378484375, 0.00035524364375, 0.0003552536515625, 0.000355354640625, 0.0003553730875, 0.0003553818140625, 0.000355406296875, 0.0003554092640625], 0, 3.3498692512512207, 1577014505.536223], "v": 0.1}
16 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [128, 192, 28, 28], "float32"], ["TENSOR", [192, 1, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [128, 192, 28, 28, "float32"], [192, 1, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "float32"], {"i": 385418, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 3, 1]], ["tile_y", "sp", [-1, 1, 7, 2]], ["tile_x", "sp", [-1, 2, 7, 1]], ["auto_unroll_max_step", "ot", 256], ["unroll_explicit", "ot", 1]]}], "r": [[0.000440803, 0.0004408038296703297, 0.00044080913553113554, 0.0004408110311355311, 0.0004408152747252747, 0.0004408232032967033, 0.000440838173992674, 0.00044084876007326006], 0, 19.497984647750854, 1577016371.363043], "v": 0.1}
17 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 192, 14, 14], "float32"], ["TENSOR", [64, 192, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 192, 14, 14, "float32"], [64, 192, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1323800, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 2]], ["tile_y", "sp", [-1, 1, 1, 14]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.00013619114844192634, 0.00013626524815864023, 0.00013627431104815864, 0.00013627742662889519, 0.00013630077393767705, 0.0001363245059490085, 0.0001363518090651558, 0.00013636298526912183], 0, 22.2538423538208, 1577017577.7665992], "v": 0.1}
18 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 384, 14, 14], "float32"], ["TENSOR", [64, 384, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 384, 14, 14, "float32"], [64, 384, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1838864, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 8, 1]], ["tile_y", "sp", [-1, 14, 1, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00024326830050251257, 0.00024333388944723617, 0.00024342593668341708, 0.00024345997487437186, 0.00024354556281407035, 0.00024355500301507537, 0.00024356102713567838, 0.00024356147638190955], 0, 22.29342246055603, 1577019846.617505], "v": 0.1}
19 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 64, 14, 14], "float32"], ["TENSOR", [384, 64, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 64, 14, 14, "float32"], [384, 64, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1292784, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 4, 4, 2]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}], "r": [[0.00022571670681605977, 0.0002257359710550887, 0.00022574140056022407, 0.00022576462651727357, 0.0002257684304388422, 0.00022577994584500468, 0.00022583613165266106, 0.00022586610737628383], 0, 15.66208553314209, 1577022401.9926887], "v": 0.1}
20 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [128, 384, 14, 14], "float32"], ["TENSOR", [384, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [128, 384, 14, 14, "float32"], [384, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 432537, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 1]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["auto_unroll_max_step", "ot", 0], ["unroll_explicit", "ot", 1]]}], "r": [[0.00033138103532008833, 0.00033150320529801326, 0.0003315039006622517, 0.0003315402582781457, 0.0003316247306843267, 0.0003316301412803532, 0.0003316793554083885, 0.0003316904856512141], 0, 19.53456687927246, 1577024074.2204301], "v": 0.1}
21 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 384, 14, 14], "float32"], ["TENSOR", [96, 384, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 384, 14, 14, "float32"], [96, 384, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1234104, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 3, 4, 2]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 0]]}], "r": [[0.0003202832582781457, 0.0003203761311258278, 0.0003203799920529801, 0.0003203970476821192, 0.0003204525099337748, 0.0003204735430463576, 0.00032055492317880794, 0.0003206461509933775], 0, 12.39969515800476, 1577025491.089609], "v": 0.1}
22 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 576, 14, 14], "float32"], ["TENSOR", [96, 576, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 576, 14, 14, "float32"], [96, 576, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 2725048, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 3, 4, 2]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 0]]}], "r": [[0.000472001599609375, 0.000472129212890625, 0.000472129615234375, 0.000472180564453125, 0.00047236093359375, 0.00047240101171875, 0.000472509556640625, 0.0004727565703125], 0, 31.21929383277893, 1577027886.4104795], "v": 0.1}
23 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 96, 14, 14], "float32"], ["TENSOR", [576, 96, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 96, 14, 14, "float32"], [576, 96, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 11509444, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 4, 9]], ["tile_y", "sp", [-1, 7, 2, 1]], ["tile_x", "sp", [-1, 1, 14, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.0004334905555555555, 0.0004335100125448029, 0.0004336135143369175, 0.0004336731827956989, 0.00043369396236559143, 0.00043372455376344083, 0.0004337485896057348, 0.00043377002688172047], 0, 10.24066972732544, 1577030442.7566364], "v": 0.1}
24 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [128, 576, 14, 14], "float32"], ["TENSOR", [576, 1, 3, 3], "float32"], [2, 2], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [128, 576, 14, 14, "float32"], [576, 1, 3, 3, "float32"], [2, 2], [1, 1], [1, 1], "float32"], {"i": 75655, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 3, 4, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0003393663023909986, 0.00033943658790436005, 0.00033946630801687765, 0.00033947313220815753, 0.0003394738438818565, 0.0003394852953586498, 0.0003395200604781997, 0.0003395253670886076], 0, 4.610151052474976, 1577033207.9550354], "v": 0.1}
25 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 576, 7, 7], "float32"], ["TENSOR", [160, 576, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 576, 7, 7, "float32"], [160, 576, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 403580, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 16, 5]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 12]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00025041541279669763, 0.0002504247450980392, 0.0002504995335397317, 0.0002506772249742002, 0.00025073734262125906, 0.000250791213622291, 0.00025079516193656095, 0.0002508958173374613], 0, 22.042405128479004, 1577034990.010283], "v": 0.1}
26 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 960, 7, 7], "float32"], ["TENSOR", [160, 960, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 960, 7, 7, "float32"], [160, 960, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 532604, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 16, 5]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 12]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0004130911094017094, 0.0004132081162393162, 0.00041325050085470086, 0.0004132856854700855, 0.00041333794700854705, 0.00041334566666666666, 0.0004134788188034188, 0.0004135694051282051], 0, 6.42118239402771, 1577036732.061099], "v": 0.1}
27 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 160, 7, 7], "float32"], ["TENSOR", [960, 160, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 160, 7, 7, "float32"], [960, 160, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 1391525, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 3]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 8]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0003890511884057971, 0.0003890642028985507, 0.00038908027858293074, 0.0003890889001610306, 0.0003891082818035427, 0.0003891102818035427, 0.00038912962318840577, 0.00038929127858293076], 0, 13.870026111602783, 1577038664.8340282], "v": 0.1}
28 | {"i": ["cuda -model=unknown", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [128, 960, 7, 7], "float32"], ["TENSOR", [960, 1, 3, 3], "float32"], [1, 1], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [128, 960, 7, 7, "float32"], [960, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], {"i": 121048, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 3, 5, 1]], ["tile_y", "sp", [-1, 1, 7, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.00021692545229007632, 0.00021698320610687022, 0.00021698354389312977, 0.000217005143129771, 0.00021701927099236642, 0.00021708887404580152, 0.0002171617576335878, 0.00021716972232824428], 0, 3.7297778129577637, 1577040416.36417], "v": 0.1}
29 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 960, 7, 7], "float32"], ["TENSOR", [320, 960, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 960, 7, 7, "float32"], [320, 960, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 782809, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 1, 8, 10]], ["tile_y", "sp", [-1, 7, 1, 1]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 6]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 1500], ["unroll_explicit", "ot", 1]]}], "r": [[0.0007245472432432432, 0.0007246782372372372, 0.0007248073693693695, 0.0007248727057057057, 0.0007248976156156156, 0.0007249364294294294, 0.0007250481381381381, 0.0007251297687687687], 0, 12.239248752593994, 1577042328.426015], "v": 0.1}
30 | {"i": ["cuda -model=unknown", "topi_nn_conv2d", [["TENSOR", [128, 320, 7, 7], "float32"], ["TENSOR", [1280, 320, 1, 1], "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {}, ["conv2d", [128, 320, 7, 7, "float32"], [1280, 320, 1, 1, "float32"], [1, 1], [0, 0], [1, 1], "NCHW", "float32"], {"i": 630587, "t": "direct", "c": null, "e": [["tile_f", "sp", [-1, 2, 8, 4]], ["tile_y", "sp", [-1, 1, 1, 7]], ["tile_x", "sp", [-1, 1, 7, 1]], ["tile_rc", "sp", [-1, 5]], ["tile_ry", "sp", [-1, 1]], ["tile_rx", "sp", [-1, 1]], ["auto_unroll_max_step", "ot", 512], ["unroll_explicit", "ot", 1]]}], "r": [[0.0009332842906976744, 0.0009333610542635659, 0.000933489531007752, 0.0009335054612403101, 0.0009335902441860465, 0.0009336596589147286, 0.0009337442364341085, 0.0009337563682170543], 0, 25.36549663543701, 1577044315.2849915], "v": 0.1}
31 | {"i": ["cuda -model=unknown", "topi_nn_dense", [["TENSOR", [128, 1280], "float32"], ["TENSOR", [1000, 1280], "float32"], null, "float32"], {}, ["dense", [128, 1280, "float32"], [1000, 1280, "float32"], 0, "float32"], {"i": 194291, "t": "direct", "c": null, "e": [["tile_x", "sp", [-1, 8, 2, 1]], ["tile_y", "sp", [-1, 5, 2, 1]], ["tile_k", "sp", [-1, 8, 1]]]}], "r": [[0.00047364567924528304, 0.00047377319496855343, 0.00047383921383647797, 0.00047407464465408803, 0.0004741108679245283, 0.00047445288050314467, 0.0004747232106918239, 0.00047494211006289305], 0, 22.404786348342896, 1577047211.6715684], "v": 0.1}
32 | 


--------------------------------------------------------------------------------