├── README.md
├── mxnet
│   ├── base
│   │   ├── bert_export.py
│   │   ├── ic_model_export.py
│   │   ├── mx_bert_serving.py
│   │   └── mx_serving.py
│   └── tvm
│       ├── tvm_bert_export.py
│       ├── tvm_export.py
│       └── tvm_serving.py
└── torch
    ├── base
    │   ├── bert_export.py
    │   ├── ic_model_export.py
    │   ├── torch_profiling.py
    │   └── torch_serving.py
    ├── onnx
    │   ├── onnx_profiling.py
    │   ├── onnx_serving.py
    │   └── torch2onnx.py
    └── tvm
        ├── tvm_bert_export.py
        ├── tvm_export.py
        ├── tvm_profiling.py
        └── tvm_serving.py

/README.md:
--------------------------------------------------------------------------------
1 | # bench-optimize-models
2 | Benchmark graph-optimized and hardware-optimized models (MXNet, PyTorch, ONNX Runtime, and TVM backends).
3 | 
--------------------------------------------------------------------------------
/mxnet/base/bert_export.py:
--------------------------------------------------------------------------------
1 | import time
2 | import argparse
3 | import numpy as np
4 | import mxnet as mx
5 | import gluonnlp as nlp
6 | 
7 | ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu()
8 | 
9 | 
10 | def bert_download(model_name,seq_length, batch_size, dtype="float32"):
11 |     inputs = np.random.randint(0, 2000, size=(batch_size, seq_length)).astype(dtype)
12 |     token_types = np.random.uniform(size=(batch_size, seq_length)).astype(dtype)
13 |     valid_length = np.asarray([seq_length] * batch_size).astype(dtype)
14 | 
15 |     inputs_nd = mx.nd.array(inputs, ctx=ctx)
16 |     token_types_nd = mx.nd.array(token_types, ctx=ctx)
17 |     valid_length_nd = mx.nd.array(valid_length, ctx=ctx)
18 | 
19 |     # Instantiate a BERT classifier using GluonNLP
20 |     if model_name == "bert_base":
21 |         model_name_ = "bert_12_768_12"
22 |         dataset = "book_corpus_wiki_en_uncased"
23 |         model, _ = nlp.model.get_model(
24 |             name=model_name_,
25 |             dataset_name=dataset,
26 |             pretrained=True,
27 |             use_pooler=True,
28 |             use_decoder=False,
29 |             use_classifier=False,
30 |         )
31 |         model = nlp.model.BERTClassifier(model, dropout=0.1, num_classes=2)
32 |         model.initialize(ctx=ctx)
33 |         model.hybridize(static_alloc=True)
34 | 
35 |         mx_out = model(inputs_nd, token_types_nd, valid_length_nd)
36 |         mx_out.wait_to_read()
37 | 
38 |         # print model info
39 |         #print("-"*10,f"{model_name} Parameter Info","-"*10)
40 |         #print(model.summary(inputs_nd,token_types_nd, valid_length_nd))
41 | 
42 |     elif model_name == "distilbert":
43 |         model_name_="distilbert_6_768_12"
44 |         dataset = "distilbert_book_corpus_wiki_en_uncased"
45 |         model, _ = nlp.model.get_model(
46 |             name=model_name_,
47 |             dataset_name=dataset,
48 |             pretrained=True,
49 |         )
50 |         model.hybridize(static_alloc=True)
51 | 
52 |         mx_out = model(inputs_nd, valid_length_nd)
53 |         mx_out.wait_to_read()
54 | 
55 |         # print("-"*10,f"{model_name} Parameter Info","-"*10)
56 |         # print(model.summary(inputs_nd, valid_length_nd))
57 | 
58 | 
59 |     target_path = f"./{model_name}_{batch_size}"
60 |     from pathlib import Path
61 |     Path(target_path).mkdir(parents=True, exist_ok=True)
62 | 
63 |     model.export(f'{model_name}_{batch_size}/model')
64 |     print("-"*10,f"Download {model_name} complete","-"*10)
65 | 
66 | 
67 | 
68 | if __name__ == "__main__":
69 |     import argparse
70 | 
71 |     parser = argparse.ArgumentParser()
72 |     parser.add_argument('--model',default='bert_base' , type=str)
73 |     parser.add_argument('--batchsize',default=1 , type=int)
74 |     parser.add_argument('--seq',default=128 , type=int)
75 | 
76 |     args = parser.parse_args()
77 |     model_name = args.model
78 |     batchsize = args.batchsize
79 |     seq_length=args.seq
80 | 
81 | 
82 |     bert_download(model_name,seq_length,batchsize)
83 | 
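For a quick sanity check of the exported artifacts, something along these lines works — a minimal sketch, assuming bert_export.py was run with its defaults (--model bert_base --batchsize 1 --seq 128) so that ./bert_base_1/model-symbol.json and ./bert_base_1/model-0000.params exist; it reuses the same SymbolBlock.imports loading pattern that mx_bert_serving.py uses below:

import numpy as np
import mxnet as mx
from mxnet import gluon

ctx = mx.cpu()
# Re-import the exported symbol/params pair produced by model.export(...) above.
net = gluon.nn.SymbolBlock.imports(
    "./bert_base_1/model-symbol.json",
    ["data0", "data1", "data2"],   # token ids, token types, valid length
    "./bert_base_1/model-0000.params",
    ctx=ctx,
)
# Random inputs with the same shapes the model was exported with (batch 1, seq 128).
tokens = mx.nd.array(np.random.randint(0, 2000, (1, 128)), ctx=ctx)
token_types = mx.nd.zeros((1, 128), ctx=ctx)
valid_length = mx.nd.array([128], ctx=ctx)
out = net(tokens, token_types, valid_length)
out.wait_to_read()
print(out.shape)   # (1, 2): the two-class BERTClassifier head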
-------------------------------------------------------------------------------- /mxnet/base/ic_model_export.py: -------------------------------------------------------------------------------- 1 | import json 2 | import mxnet as mx 3 | from mxnet import gluon, nd 4 | from mxnet.gluon.model_zoo import vision 5 | import numpy as np 6 | 7 | 8 | ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() 9 | 10 | def download_model(model_name,batchsize,imgsize=224): 11 | models_detail = { 12 | 'densenet' : vision.densenet161(pretrained=True, ctx=ctx), 13 | 'resnet18' : vision.resnet18_v1(pretrained=True, ctx=ctx), 14 | 'squeezenet' : vision.squeezenet1_0(pretrained=True, ctx=ctx), 15 | 'mobilenet':vision.mobilenet0_5(pretrained=True, ctx=ctx), 16 | 'mobilenet_v2':vision.get_mobilenet_v2(1, pretrained=True), 17 | 'inception_v3':vision.inception_v3(pretrained=True, ctx=ctx), 18 | 'resnet50': vision.get_resnet(1, 50, pretrained=True), 19 | 'alexnet':vision.alexnet(pretrained=True,ctx=ctx), 20 | 'vgg16':vision.vgg16(pretrained=True, ctx=ctx), 21 | 'vgg19':vision.vgg19(pretrained=True, ctx=ctx) 22 | } 23 | 24 | model = models_detail[model_name] 25 | model.hybridize() 26 | 27 | input_shape = (batchsize, 3, imgsize, imgsize) 28 | data = np.random.uniform(size=input_shape) 29 | 30 | data_array = np.random.uniform(0, 255, size=input_shape).astype("float32") 31 | mx_data = mx.nd.array(data_array) 32 | model(mx_data) 33 | 34 | target_path = f"./{model_name}" 35 | from pathlib import Path 36 | Path(target_path).mkdir(parents=True, exist_ok=True) 37 | 38 | model.export(f'{model_name}/model') 39 | print("-"*10,f"Download and export {model_name} complete","-"*10) 40 | 41 | if __name__ == "__main__": 42 | import argparse 43 | 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--model',default='resnet50' , type=str) 46 | parser.add_argument('--batchsize',default=8 , type=int) 47 | 48 | 49 | args = parser.parse_args() 50 | 51 | model_name = args.model 52 | batchsize = args.batchsize 53 | img_size = 224 54 | 55 | if args.model == "all": 56 | models = ["mobilenet", "mobilenet_v2", "inception_v3","resnet50","alexnet","vgg16","vgg19"] 57 | else: 58 | models = [args.model] 59 | 60 | for model in models: 61 | if model == 'inception_v3': 62 | img_size = 299 63 | download_model(model,batchsize,img_size) 64 | -------------------------------------------------------------------------------- /mxnet/base/mx_bert_serving.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import mxnet as mx 3 | import mxnet.ndarray as nd 4 | from mxnet import gluon 5 | import time 6 | import numpy as np 7 | 8 | import argparse 9 | 10 | 11 | ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() 12 | 13 | 14 | def timer(thunk, repeat=1, number=10, dryrun=3, min_repeat_ms=1000): 15 | """Helper function to time a function""" 16 | for i in range(dryrun): 17 | thunk() 18 | ret = [] 19 | for _ in range(repeat): 20 | while True: 21 | beg = time.time() 22 | for _ in range(number): 23 | thunk() 24 | end = time.time() 25 | lat = (end - beg) * 1e3 26 | if lat >= min_repeat_ms: 27 | break 28 | number = int(max(min_repeat_ms / (lat / number) + 1, number * 1.618)) 29 | ret.append(lat / number) 30 | return ret 31 | 32 | def load_model(model_name): 33 | model_json = f"./{model_name}/model-symbol.json" 34 | model_params = f"./{model_name}/model-0000.params" 35 | 36 | if model_name == "bert_base": 37 | with warnings.catch_warnings(): 38 | warnings.simplefilter("ignore") 39 | model = 
gluon.nn.SymbolBlock.imports(model_json, ['data0','data1','data2'], model_params, ctx=ctx) 40 | elif model_name == "distilbert": 41 | with warnings.catch_warnings(): 42 | warnings.simplefilter("ignore") 43 | model = gluon.nn.SymbolBlock.imports(model_json, ['data0','data1'], model_params, ctx=ctx) 44 | return model 45 | 46 | def benchmark(model_name,batch_size,seq_length,dtype='float32'): 47 | model = load_model(model_name) 48 | inputs = np.random.randint(0, 2000, size=(batch_size, seq_length)).astype(dtype) 49 | token_types = np.random.uniform(size=(batch_size, seq_length)).astype(dtype) 50 | valid_length = np.asarray([seq_length] * batch_size).astype(dtype) 51 | 52 | inputs_nd = mx.nd.array(inputs, ctx=ctx) 53 | token_types_nd = mx.nd.array(token_types, ctx=ctx) 54 | valid_length_nd = mx.nd.array(valid_length, ctx=ctx) 55 | 56 | if model_name == "bert_base": 57 | # Prepare input data 58 | model.hybridize(static_alloc=True) 59 | mx_out = model(inputs_nd, token_types_nd, valid_length_nd) 60 | mx_out.wait_to_read() 61 | res = timer(lambda: model(inputs_nd,token_types_nd,valid_length_nd).wait_to_read(), 62 | repeat=3, 63 | dryrun=5, 64 | min_repeat_ms=1000) 65 | 66 | elif model_name == "distilbert": 67 | model.hybridize(static_alloc=True) 68 | mx_out = model(inputs_nd, valid_length_nd,) 69 | mx_out.wait_to_read() 70 | 71 | # Benchmark the MXNet latency 72 | res = timer(lambda: model(inputs_nd, valid_length_nd).wait_to_read(), 73 | repeat=3, 74 | dryrun=5, 75 | min_repeat_ms=1000) 76 | 77 | print(f"MXNet {model_name} latency for batch {batch_size} : {np.mean(res):.2f} ms") 78 | 79 | 80 | 81 | if __name__ == "__main__": 82 | import argparse 83 | 84 | parser = argparse.ArgumentParser() 85 | parser.add_argument('--model',default='bert_base' , type=str) 86 | parser.add_argument('--batchsize',default=1 , type=int) 87 | parser.add_argument('--seq',default=128 , type=int) 88 | 89 | args = parser.parse_args() 90 | 91 | model_name = args.model 92 | batchsize = args.batchsize 93 | seq_length=args.seq 94 | 95 | benchmark(model_name,batchsize,seq_length) 96 | -------------------------------------------------------------------------------- /mxnet/base/mx_serving.py: -------------------------------------------------------------------------------- 1 | from json import load 2 | import warnings 3 | import mxnet as mx 4 | import mxnet.ndarray as nd 5 | from mxnet import nd, gluon 6 | import time 7 | import numpy as np 8 | 9 | import argparse 10 | 11 | 12 | ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() 13 | 14 | 15 | def timer(thunk, repeat=1, number=10, dryrun=3, min_repeat_ms=1000): 16 | """Helper function to time a function""" 17 | for i in range(dryrun): 18 | thunk() 19 | ret = [] 20 | for _ in range(repeat): 21 | while True: 22 | beg = time.time() 23 | for _ in range(number): 24 | thunk() 25 | end = time.time() 26 | lat = (end - beg) * 1e3 27 | if lat >= min_repeat_ms: 28 | break 29 | number = int(max(min_repeat_ms / (lat / number) + 1, number * 1.618)) 30 | ret.append(lat / number) 31 | return ret 32 | 33 | def load_model(model_name): 34 | model_json = f"./{model_name}/model-symbol.json" 35 | model_params = f"./{model_name}/model-0000.params" 36 | 37 | 38 | with warnings.catch_warnings(): 39 | warnings.simplefilter("ignore") 40 | model = gluon.nn.SymbolBlock.imports(model_json, ['data'], model_params, ctx=ctx) 41 | return model 42 | 43 | def benchmark(model_name,imgsize,batchsize): 44 | input_shape = (batchsize, 3, imgsize, imgsize) 45 | data = np.random.uniform(size=input_shape) 46 | 47 | 
input_data = mx.nd.array(data, ctx=ctx) 48 | 49 | model = load_model(model_name) 50 | 51 | res = timer(lambda: model(input_data).wait_to_read(), 52 | repeat=3, 53 | dryrun=5, 54 | min_repeat_ms=1000) 55 | print(f"MXNet {model_name} latency for batch {batchsize} : {np.mean(res):.2f} ms") 56 | 57 | 58 | 59 | if __name__ == "__main__": 60 | import argparse 61 | 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument('--model',default='resnet50' , type=str) 64 | parser.add_argument('--batchsize',default=8,type=int) 65 | args = parser.parse_args() 66 | 67 | model_name = args.model 68 | batchsize = args.batchsize 69 | 70 | img_size = 224 71 | if args.model == "all": 72 | models = ["mobilenet", "mobilenet_v2", "inception_v3","resnet50","alexnet","vgg16","vgg19"] 73 | else: 74 | models = [args.model] 75 | 76 | for model in models: 77 | if model == 'inception_v3': 78 | img_size = 299 79 | benchmark(model,img_size,batchsize) 80 | -------------------------------------------------------------------------------- /mxnet/tvm/tvm_bert_export.py: -------------------------------------------------------------------------------- 1 | import time 2 | import argparse 3 | import numpy as np 4 | import mxnet as mx 5 | import gluonnlp as nlp 6 | import tvm 7 | from tvm import relay 8 | # import tvm.contrib.graph_runtime as runtime 9 | import tvm.contrib.graph_executor as runtime 10 | 11 | 12 | 13 | import tvm.testing 14 | import warnings 15 | from mxnet import gluon 16 | 17 | warnings.filterwarnings(action='ignore') 18 | 19 | def load_model(model_name,batch_size): 20 | ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() 21 | 22 | model_json = f"../base/{model_name}_{batch_size}/model-symbol.json" 23 | model_params = f"../base/{model_name}_{batch_size}/model-0000.params" 24 | 25 | if model_name == "bert_base": 26 | with warnings.catch_warnings(): 27 | warnings.simplefilter("ignore") 28 | model = gluon.nn.SymbolBlock.imports(model_json, ['data0','data1','data2'], model_params, ctx=ctx) 29 | elif model_name == "distilbert": 30 | with warnings.catch_warnings(): 31 | warnings.simplefilter("ignore") 32 | model = gluon.nn.SymbolBlock.imports(model_json, ['data0','data1'], model_params, ctx=ctx) 33 | return model 34 | 35 | def compile_tvm(model_name,batch_size,seq_length,target): 36 | 37 | # load origianl mxnet model 38 | model = load_model(model_name,batch_size) 39 | 40 | # Prepare input data 41 | dtype = "float32" 42 | inputs = np.random.randint(0, 2000, size=(batch_size, seq_length)).astype(dtype) 43 | token_types = np.random.uniform(size=(batch_size, seq_length)).astype(dtype) 44 | valid_length = np.asarray([seq_length] * batch_size).astype(dtype) 45 | 46 | ###################################### 47 | # Optimize the BERT model using TVM 48 | ###################################### 49 | 50 | # First, Convert the MXNet model into TVM Relay format 51 | if model_name == "bert_base": 52 | shape_dict = { 53 | 'data0': (batch_size, seq_length), 54 | 'data1': (batch_size, seq_length), 55 | 'data2': (batch_size,) 56 | } 57 | elif model_name=="distilbert": 58 | shape_dict = { 59 | 'data0': (batch_size, seq_length), 60 | 'data1': (batch_size,) 61 | } 62 | 63 | mod, params = relay.frontend.from_mxnet(model, shape_dict) 64 | 65 | # Compile the imported model 66 | if target == "arm": 67 | target = tvm.target.arm_cpu() 68 | 69 | with tvm.transform.PassContext(opt_level=3): 70 | mod = relay.transform.InferType()(mod) 71 | lib = relay.build(mod, target=target, params=params) 72 | 
lib.export_library(f"./{model_name}_{batch_size}.tar") 73 | 74 | dev = tvm.cpu() 75 | module = runtime.GraphModule(lib["default"](dev)) 76 | 77 | if model_name == "bert_base": 78 | module.set_input(data0=inputs, data1=token_types, data2=valid_length) 79 | elif model_name == "distilbert": 80 | module.set_input(data0=inputs, data1=valid_length) 81 | 82 | # Evaluate 83 | ftimer = module.module.time_evaluator("run", dev, min_repeat_ms=500, repeat=5) 84 | prof_res = np.array(ftimer().results) * 1000 85 | print(f"TVM {model_name} latency for batch {batch_size} : {np.mean(prof_res):.2f} ms") 86 | 87 | if __name__ == "__main__": 88 | import argparse 89 | 90 | parser = argparse.ArgumentParser() 91 | parser.add_argument('--model',default='bert_base' , type=str) 92 | parser.add_argument('--target',default="llvm -mcpu=core-avx2" , type=str) 93 | parser.add_argument('--batchsize',default=1 , type=int) 94 | parser.add_argument('--seq_length',default=128 , type=int) 95 | 96 | args = parser.parse_args() 97 | 98 | model_name = args.model 99 | target = args.target 100 | batch_size = args.batchsize 101 | seq_length = args.seq_length 102 | 103 | compile_tvm(model_name,batch_size,seq_length,target) 104 | -------------------------------------------------------------------------------- /mxnet/tvm/tvm_export.py: -------------------------------------------------------------------------------- 1 | from json import load 2 | import warnings 3 | import mxnet as mx 4 | import mxnet.ndarray as nd 5 | from mxnet import nd, gluon 6 | import time 7 | import numpy as np 8 | import tvm 9 | from tvm import relay 10 | import tvm.contrib.graph_executor as runtime 11 | 12 | import argparse 13 | 14 | 15 | def timer(thunk, repeat=1, number=10, dryrun=3, min_repeat_ms=1000): 16 | """Helper function to time a function""" 17 | for i in range(dryrun): 18 | thunk() 19 | ret = [] 20 | for _ in range(repeat): 21 | while True: 22 | beg = time.time() 23 | for _ in range(number): 24 | thunk() 25 | end = time.time() 26 | lat = (end - beg) * 1e3 27 | if lat >= min_repeat_ms: 28 | break 29 | number = int(max(min_repeat_ms / (lat / number) + 1, number * 1.618)) 30 | ret.append(lat / number) 31 | return ret 32 | 33 | def load_model(model_name,batchsize): 34 | ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() 35 | 36 | model_json = f"../base/{model_name}_{batchsize}/model-symbol.json" 37 | model_params = f"../base/{model_name}_{batchsize}/model-0000.params" 38 | 39 | 40 | with warnings.catch_warnings(): 41 | warnings.simplefilter("ignore") 42 | model = gluon.nn.SymbolBlock.imports(model_json, ['data'], model_params, ctx=ctx) 43 | return model 44 | 45 | 46 | def convert_to_nhwc(mod): 47 | """Convert to NHWC layout""" 48 | desired_layouts = {"nn.conv2d": ["NHWC", "default"]} 49 | seq = tvm.transform.Sequential( 50 | [ 51 | relay.transform.RemoveUnusedFunctions(), 52 | relay.transform.ConvertLayout(desired_layouts), 53 | ] 54 | ) 55 | with tvm.transform.PassContext(opt_level=3): 56 | mod = seq(mod) 57 | return mod 58 | 59 | 60 | def compile_export(mod,params,target,batch_size): 61 | if target == "arm": 62 | target = tvm.target.arm_cpu() 63 | with tvm.transform.PassContext(opt_level=3): 64 | mod = relay.transform.InferType()(mod) 65 | lib = relay.build(mod, target=target, params=params) 66 | lib.export_library(f"./{model_name}_{batch_size}.tar") 67 | return lib 68 | 69 | 70 | def benchmark(model_name,imgsize,batch_size,target,dtype="float32",layout="NCHW"): 71 | input_name = "data" 72 | input_shape = (batch_size, 3, imgsize, imgsize) 73 | data = 
np.random.uniform(size=input_shape) 74 | 75 | model = load_model(model_name,batch_size) 76 | 77 | data_array = np.random.uniform(0, 255, size=input_shape).astype("float32") 78 | # mxnet to tvm format 79 | 80 | if layout == "NHWC": 81 | mod = convert_to_nhwc(mod) 82 | else: 83 | assert layout == "NCHW" 84 | 85 | mod, params = relay.frontend.from_mxnet(model, shape={"data": input_shape},dtype=dtype) 86 | 87 | lib=compile_export(mod,params,target,batch_size) 88 | print("export done :",f"{model_name}_{batch_size}.tar") 89 | 90 | dev = tvm.cpu() 91 | module = runtime.GraphModule(lib["default"](dev)) 92 | 93 | data = np.random.uniform(size=input_shape) 94 | module.set_input(input_name, data) 95 | 96 | # Evaluate 97 | ftimer = module.module.time_evaluator("run", dev, min_repeat_ms=500, repeat=5) 98 | prof_res = np.array(ftimer().results) * 1000 99 | print(f"TVM {model_name} latency for batch {batch_size} : {np.mean(prof_res):.2f} ms") 100 | 101 | 102 | 103 | if __name__ == "__main__": 104 | import argparse 105 | 106 | parser = argparse.ArgumentParser() 107 | parser.add_argument('--model',default='resnet50' , type=str) 108 | parser.add_argument('--batchsize',default=1 , type=int) 109 | parser.add_argument('--target',default='llvm -mcpu=core-avx2' , type=str) 110 | 111 | args = parser.parse_args() 112 | 113 | model_name = args.model 114 | batch_size = args.batchsize 115 | target = args.target 116 | 117 | img_size = 224 118 | if args.model == "all": 119 | models = ["mobilenet", "mobilenet_v2", "inception_v3","resnet50","alexnet","vgg16","vgg19"] 120 | else: 121 | models = [args.model] 122 | 123 | for model in models: 124 | if model == 'inception_v3': 125 | img_size = 299 126 | benchmark(model,img_size,batch_size,target) 127 | -------------------------------------------------------------------------------- /mxnet/tvm/tvm_serving.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import argparse 4 | import tvm.contrib.graph_executor as runtime 5 | 6 | from tvm import relay 7 | from tvm.relay import testing 8 | import tvm 9 | from tvm import te 10 | 11 | def benchmark(model, img_size,batch_size, repeat=3): 12 | ctx = tvm.cpu() 13 | 14 | input_name = "data" 15 | input_shape = (batch_size, 3, img_size, img_size) 16 | output_shape = (batch_size, 1000) 17 | 18 | loaded_lib = tvm.runtime.load_module(f'./{model}.tar') 19 | 20 | module = runtime.GraphModule(loaded_lib["default"](ctx)) 21 | 22 | # Feed input data 23 | data = np.random.uniform(size=input_shape) 24 | module.set_input(input_name, data) 25 | 26 | # Evaluate 27 | ftimer = module.module.time_evaluator("run", ctx, min_repeat_ms=500, repeat=repeat) 28 | prof_res = np.array(ftimer().results) * 1000 29 | print(f"TVM {model_name} latency for batch {batch_size} : {np.mean(prof_res):.2f} ms") 30 | 31 | return np.array(ftimer().results) 32 | 33 | 34 | if __name__ == "__main__": 35 | import argparse 36 | 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--model',default='resnet50' , type=str) 39 | parser.add_argument('--batchsize',default=1 , type=int) 40 | 41 | args = parser.parse_args() 42 | 43 | model_name = args.model 44 | batch_size = args.batchsize 45 | img_size = 224 46 | 47 | if args.model == "all": 48 | models = ["mobilenet", "mobilenet_v2", "inception_v3","resnet50","alexnet","vgg16","vgg19"] 49 | else: 50 | models = [args.model] 51 | 52 | for model in models: 53 | if model == 'inception_v3': 54 | img_size = 299 55 | benchmark(model,img_size,batch_size) 56 
| -------------------------------------------------------------------------------- /torch/base/bert_export.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from pytorch_transformers.tokenization_bert import BertTokenizer 5 | from pytorch_transformers.modeling_bert import BertModel, BertForMaskedLM 6 | 7 | 8 | import numpy as np 9 | from pathlib import Path 10 | import time 11 | 12 | def timer(thunk, repeat=1, number=10, dryrun=3, min_repeat_ms=1000): 13 | """Helper function to time a function""" 14 | for i in range(dryrun): 15 | thunk() 16 | ret = [] 17 | for _ in range(repeat): 18 | while True: 19 | beg = time.time() 20 | for _ in range(number): 21 | thunk() 22 | end = time.time() 23 | lat = (end - beg) * 1e3 24 | if lat >= min_repeat_ms: 25 | break 26 | number = int(max(min_repeat_ms / (lat / number) + 1, number * 1.618)) 27 | ret.append(lat / number) 28 | return ret 29 | 30 | def inference_model(model_name,batchsize,seq_length,dtype="float32"): 31 | 32 | inputs = np.random.randint(0, 2000, size=(seq_length)) 33 | token_types = np.random.randint(0,2,size=(seq_length)) 34 | 35 | 36 | tokens_tensor = torch.tensor(np.array([inputs])) 37 | segments_tensors = torch.tensor(np.array([token_types])) 38 | 39 | 40 | model = BertModel.from_pretrained('bert-base-uncased') 41 | model.eval() 42 | model(tokens_tensor, segments_tensors) 43 | 44 | target_path = f"./{model_name}/" 45 | from pathlib import Path 46 | Path(target_path).mkdir(parents=True, exist_ok=True) 47 | 48 | torch.save(model, target_path + 'model.pt') # 전체 모델 저장 49 | torch.save(model.state_dict(), target_path + 'model_state_dict.pt') 50 | 51 | print("-"*10,f"Download and export {model_name} complete","-"*10) 52 | 53 | 54 | res = timer(lambda: model(tokens_tensor,segments_tensors), 55 | repeat=3, 56 | dryrun=5, 57 | min_repeat_ms=1000) 58 | print(f"Pytorch {model_name} latency for batch {batchsize} : {np.mean(res):.2f} ms") 59 | 60 | if __name__ == "__main__": 61 | import argparse 62 | 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('--model',default='bert' , type=str) 65 | parser.add_argument('--batchsize',default=1 , type=int) 66 | parser.add_argument('--seq_length',default=128 , type=int) 67 | 68 | 69 | args = parser.parse_args() 70 | 71 | seq_length= args.seq_length 72 | model_name = args.model 73 | batchsize = args.batchsize 74 | 75 | inference_model(model_name,batchsize,seq_length) 76 | -------------------------------------------------------------------------------- /torch/base/ic_model_export.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torchvision.models as models 4 | import numpy as np 5 | from pathlib import Path 6 | 7 | def download_model(model_name,imgsize=224): 8 | models_detail = { 9 | 'efficientnet_b0' : models.efficientnet_b0(pretrained=True), 10 | 'mobilenet_v2':models.mobilenet_v2(pretrained=True), 11 | 'resnet18' : models.resnet18(pretrained=True), 12 | 'mobilenet_v3_small' : models.mobilenet_v3_small(pretrained=True), 13 | 'mnasnet':models.mnasnet1_0(pretrained=True), 14 | 'shufflenet' : models.shufflenet_v2_x1_0(pretrained=True), 15 | 'squeezenet' :models.squeezenet1_0(pretrained=True), 16 | 17 | 'inception_v3':models.inception_v3(pretrained=True), 18 | 'resnet50': models.resnet50(pretrained=True), 19 | 'alexnet':models.alexnet(pretrained=True), 20 | 'vgg16':models.vgg16(pretrained=True), 21 | 'vgg19':models.vgg19(pretrained=True), 22 | } 23 | 24 
| model = models_detail[model_name] 25 | model.eval() 26 | # input_shape = (batchsize, 3, imgsize, imgsize) 27 | # data = np.random.uniform(size=input_shape) 28 | 29 | # model(data) 30 | 31 | target_path = f"./{model_name}/" 32 | from pathlib import Path 33 | Path(target_path).mkdir(parents=True, exist_ok=True) 34 | 35 | torch.save(model, target_path + 'model.pt') # 전체 모델 저장 36 | torch.save(model.state_dict(), target_path + 'model_state_dict.pt') 37 | 38 | print("-"*10,f"Download and export {model_name} complete","-"*10) 39 | 40 | if __name__ == "__main__": 41 | import argparse 42 | 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('--model',default='resnet50' , type=str) 45 | 46 | 47 | args = parser.parse_args() 48 | 49 | model_name = args.model 50 | img_size = 224 51 | 52 | 53 | if model_name == 'inception_v3': 54 | img_size = 299 55 | download_model(model_name,img_size) 56 | -------------------------------------------------------------------------------- /torch/base/torch_profiling.py: -------------------------------------------------------------------------------- 1 | from json import load 2 | import warnings 3 | import torch 4 | import time 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | 10 | def timer(thunk, repeat, number=10, dryrun=3, min_repeat_ms=1000): 11 | """Helper function to time a function""" 12 | for i in range(dryrun): 13 | thunk() 14 | ret = [] 15 | for _ in range(repeat): 16 | while True: 17 | beg = time.time() 18 | for _ in range(number): 19 | thunk() 20 | end = time.time() 21 | lat = (end - beg) * 1e3 22 | if lat >= min_repeat_ms: 23 | break 24 | number = int(max(min_repeat_ms / (lat / number) + 1, number * 1.618)) 25 | ret.append(lat / number) 26 | return ret 27 | 28 | def load_model(model_name,batch_size): 29 | 30 | PATH = f"./{model_name}/" 31 | model = torch.load(PATH + 'model.pt') # 전체 모델을 통째로 불러옴, 클래스 선언 필수 32 | model.load_state_dict(torch.load(PATH + 'model_state_dict.pt')) # state_dict를 불러 온 후, 모델에 저장 33 | 34 | 35 | return model 36 | 37 | def benchmark(model_name,batchsize,imgsize): 38 | input_shape = (batchsize, 3, imgsize, imgsize) 39 | data_array = np.random.uniform(0, 255, size=input_shape).astype("float32") 40 | torch_data = torch.tensor(data_array) 41 | 42 | model = load_model(model_name,batchsize) 43 | model.eval() 44 | 45 | # profiling 46 | with torch.autograd.profiler.profile() as prof: 47 | output = model(torch_data) 48 | # profiler.key_averages 는 연산자의 이름에 따라 결과를 집계 49 | print(prof.key_averages().table(sort_by="self_cpu_time_total")) 50 | 51 | res = timer(lambda: model(torch_data), 52 | repeat=10, 53 | dryrun=5, 54 | min_repeat_ms=1000) 55 | print(f"Pytorch {model_name} latency for batch {batchsize} : {np.mean(res):.2f} ms") 56 | 57 | 58 | 59 | if __name__ == "__main__": 60 | import argparse 61 | 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument('--model',default='resnet50' , type=str) 64 | parser.add_argument('--batchsize',default=1 , type=int) 65 | 66 | args = parser.parse_args() 67 | 68 | model_name = args.model 69 | batchsize = args.batchsize 70 | 71 | img_size = 224 72 | 73 | if model_name == 'inception_v3': 74 | img_size = 299 75 | benchmark(model_name,batchsize,img_size) 76 | -------------------------------------------------------------------------------- /torch/base/torch_serving.py: -------------------------------------------------------------------------------- 1 | from json import load 2 | import warnings 3 | import torch 4 | import time 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | 10 | def 
timer(thunk, repeat=1, number=10, dryrun=3, min_repeat_ms=1000): 11 | """Helper function to time a function""" 12 | for i in range(dryrun): 13 | thunk() 14 | ret = [] 15 | for _ in range(repeat): 16 | while True: 17 | beg = time.time() 18 | for _ in range(number): 19 | thunk() 20 | end = time.time() 21 | lat = (end - beg) * 1e3 22 | if lat >= min_repeat_ms: 23 | break 24 | number = int(max(min_repeat_ms / (lat / number) + 1, number * 1.618)) 25 | ret.append(lat / number) 26 | return ret 27 | 28 | def load_model(model_name,batch_size): 29 | 30 | PATH = f"./{model_name}_{batch_size}/" 31 | model = torch.load(PATH + 'model.pt') # 전체 모델을 통째로 불러옴, 클래스 선언 필수 32 | model.load_state_dict(torch.load(PATH + 'model_state_dict.pt')) # state_dict를 불러 온 후, 모델에 저장 33 | 34 | 35 | return model 36 | 37 | def benchmark(model_name,batchsize,imgsize): 38 | input_shape = (batchsize, 3, imgsize, imgsize) 39 | data_array = np.random.uniform(0, 255, size=input_shape).astype("float32") 40 | torch_data = torch.tensor(data_array) 41 | 42 | model = load_model(model_name,batchsize) 43 | model.eval() 44 | 45 | res = timer(lambda: model(torch_data), 46 | repeat=3, 47 | dryrun=5, 48 | min_repeat_ms=1000) 49 | print(f"Pytorch {model_name} latency for batch {batchsize} : {np.mean(res):.2f} ms") 50 | 51 | 52 | 53 | if __name__ == "__main__": 54 | import argparse 55 | 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument('--model',default='resnet50' , type=str) 58 | parser.add_argument('--batchsize',default=1 , type=int) 59 | 60 | args = parser.parse_args() 61 | 62 | model_name = args.model 63 | batchsize = args.batchsize 64 | 65 | img_size = 224 66 | 67 | if model_name == 'inception_v3': 68 | img_size = 299 69 | benchmark(model_name,batchsize,img_size) 70 | -------------------------------------------------------------------------------- /torch/onnx/onnx_profiling.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | from onnx import helper 3 | import onnxruntime as ort 4 | import numpy as np 5 | import pandas as pd 6 | import json 7 | import argparse 8 | import time 9 | from collections import OrderedDict 10 | from onnxruntime import InferenceSession, SessionOptions, get_device 11 | 12 | from onnxruntime.capi._pybind_state import ( # pylint: disable=E0611 13 | SessionIOBinding, OrtDevice as C_OrtDevice, OrtValue as C_OrtValue) 14 | 15 | 16 | if get_device().upper() == 'GPU': 17 | ort_device = C_OrtDevice( 18 | C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0) 19 | provider = 'CUDAExecutionProvider' 20 | else: 21 | ort_device = C_OrtDevice( 22 | C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0) 23 | provider = 'CPUExecutionProvider' 24 | 25 | print("provider = %r" % provider) 26 | 27 | 28 | def make_dataset(batch_size,size): 29 | image_shape = (3, size, size) 30 | data_shape = (batch_size,) + image_shape 31 | 32 | data = np.random.uniform(-1, 1, size=data_shape).astype("float32") 33 | 34 | return data,image_shape 35 | 36 | def original_onnx_serving(model_name,batch_size,size,repeat=10): 37 | model_path = f"./{model_name}_{batch_size}.onnx" 38 | so = SessionOptions() 39 | so.enable_profiling = True 40 | session = ort.InferenceSession(model_path, so, providers=[provider]) 41 | # bind = SessionIOBinding(session._sess) 42 | 43 | print("graph_optimization_level:", so.graph_optimization_level) 44 | 45 | # session = ort.InferenceSession(model_path) 46 | session.get_modelmeta() 47 | inname = [input.name for input in session.get_inputs()] 48 | outname = [output.name for 
output in session.get_outputs()] 49 | 50 | data, image_shape = make_dataset(batch_size,size) 51 | 52 | time_list = [] 53 | for i in range(repeat): 54 | start_time = time.time() 55 | session.run(outname, {inname[0]: data}) 56 | running_time = time.time() - start_time 57 | # print(f"ONNX serving {model_name}-{batch_size} inference latency : ",(running_time)*1000,"ms") 58 | time_list.append(running_time) 59 | 60 | prof = session.end_profiling() 61 | 62 | time_mean = np.mean(np.array(time_list[1:])) 63 | time_medium = np.median(np.array(time_list[1:])) 64 | print(f"{model_name} inference time medium : {time_medium*1000} ms") 65 | print(f"{model_name} inference time mean : {time_mean*1000} ms") 66 | 67 | 68 | if __name__ == "__main__": 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument('--model',default='resnet50' , type=str) 71 | parser.add_argument('--batchsize',default=1 , type=int) 72 | parser.add_argument('--repeat',default=100 , type=int) 73 | 74 | args = parser.parse_args() 75 | 76 | model_name = args.model 77 | batch_size = args.batchsize 78 | repeat = args.repeat 79 | 80 | 81 | img_size=224 82 | if model_name == "inception_v3": 83 | img_size = 299 84 | 85 | original_onnx_serving(model_name,batch_size,img_size,repeat) 86 | -------------------------------------------------------------------------------- /torch/onnx/onnx_serving.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | from onnx import helper 3 | import onnxruntime as ort 4 | import numpy as np 5 | import argparse 6 | import time 7 | 8 | 9 | def make_dataset(batch_size,size): 10 | image_shape = (3, size, size) 11 | data_shape = (batch_size,) + image_shape 12 | 13 | data = np.random.uniform(-1, 1, size=data_shape).astype("float32") 14 | 15 | return data,image_shape 16 | 17 | def original_onnx_serving(model_name,batch_size,size,repeat=10): 18 | model_path = f"./{model_name}_{batch_size}.onnx" 19 | session = ort.InferenceSession(model_path) 20 | session.get_modelmeta() 21 | inname = [input.name for input in session.get_inputs()] 22 | outname = [output.name for output in session.get_outputs()] 23 | 24 | data, image_shape = make_dataset(batch_size,size) 25 | 26 | time_list = [] 27 | for i in range(repeat): 28 | start_time = time.time() 29 | session.run(outname, {inname[0]: data}) 30 | running_time = time.time() - start_time 31 | # print(f"ONNX serving {model_name}-{batch_size} inference latency : ",(running_time)*1000,"ms") 32 | time_list.append(running_time) 33 | 34 | 35 | time_mean = np.mean(np.array(time_list[1:])) 36 | time_medium = np.median(np.array(time_list[1:])) 37 | print(f"{model_name} inference time medium : {time_medium*1000} ms") 38 | print(f"{model_name} inference time mean : {time_mean*1000} ms") 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument('--model',default='resnet50' , type=str) 44 | parser.add_argument('--batchsize',default=1 , type=int) 45 | parser.add_argument('--repeat',default=100 , type=int) 46 | 47 | args = parser.parse_args() 48 | 49 | model_name = args.model 50 | batch_size = args.batchsize 51 | repeat = args.repeat 52 | 53 | 54 | img_size=224 55 | if model_name == "inception_v3": 56 | img_size = 299 57 | 58 | original_onnx_serving(model_name,batch_size,img_size,repeat) 59 | -------------------------------------------------------------------------------- /torch/onnx/torch2onnx.py: -------------------------------------------------------------------------------- 1 | import 
torchvision.models as models 2 | import torch.onnx 3 | import onnx 4 | import onnxoptimizer 5 | import numpy as np 6 | import argparse 7 | 8 | 9 | def convert(model_name,batchsize,size): 10 | PATH = f"../base/{model_name}/" 11 | model = torch.load(PATH + 'model.pt') # 전체 모델을 통째로 불러옴, 클래스 선언 필수 12 | model.load_state_dict(torch.load(PATH + 'model_state_dict.pt')) # state_dict를 불러 온 후, 모델에 저장 13 | 14 | # ------------------------ onnx export ----------------------------- 15 | output_onnx = f'{model_name}_{batch_size}.onnx' 16 | print("==> Exporting model to ONNX format at '{}'".format(output_onnx)) 17 | input_names = ["input0"] 18 | output_names = ["output0"] 19 | inputs = torch.randn(batch_size, 3, size, size) 20 | 21 | torch_out = torch.onnx._export(model, inputs, output_onnx, export_params=True, verbose=False, 22 | input_names=input_names, output_names=output_names) 23 | 24 | def optimize_onnx(model_name,batch_size,skip_fuse_bn=False): 25 | opt_onnx = f'{model_name}_{batch_size}.opt.onnx' 26 | model_path = f"./{model_name}_{batch_size}.onnx" 27 | 28 | model = onnx.load(model_path) 29 | onnx.checker.check_model(model) 30 | onnx.helper.strip_doc_string(model) 31 | optimizers_list = onnxoptimizer.get_fuse_and_elimination_passes() 32 | if skip_fuse_bn: 33 | optimizers_list.remove('fuse_bn_into_conv') 34 | print(optimizers_list) 35 | model = onnxoptimizer.optimize(model, optimizers_list, 36 | fixed_point=True) 37 | onnx.checker.check_model(model) 38 | with open(opt_onnx, "wb") as f: 39 | f.write(model.SerializeToString()) 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument('--model',default='resnet50' , type=str) 44 | parser.add_argument('--batchsize',default=1 , type=int) 45 | 46 | args = parser.parse_args() 47 | 48 | model_name = args.model 49 | batch_size = args.batchsize 50 | 51 | 52 | img_size=224 53 | if model_name == "inception_v3": 54 | img_size = 299 55 | convert(model_name,batch_size,img_size) 56 | optimize_onnx(model_name,batch_size) 57 | -------------------------------------------------------------------------------- /torch/tvm/tvm_bert_export.py: -------------------------------------------------------------------------------- 1 | from json import load 2 | import warnings 3 | import time 4 | import numpy as np 5 | import tvm 6 | from tvm import relay 7 | import tvm.contrib.graph_executor as runtime 8 | import torch 9 | 10 | import argparse 11 | 12 | 13 | 14 | def load_model(model_name): 15 | 16 | PATH = f"../../base/{model_name}/" 17 | model = torch.load(PATH + 'model.pt') # 전체 모델을 통째로 불러옴, 클래스 선언 필수 18 | model.load_state_dict(torch.load(PATH + 'model_state_dict.pt')) # state_dict를 불러 온 후, 모델에 저장 19 | 20 | return model 21 | 22 | 23 | def convert_to_nhwc(mod): 24 | """Convert to NHWC layout""" 25 | desired_layouts = {"nn.conv2d": ["NHWC", "default"]} 26 | seq = tvm.transform.Sequential( 27 | [ 28 | relay.transform.RemoveUnusedFunctions(), 29 | relay.transform.ConvertLayout(desired_layouts), 30 | ] 31 | ) 32 | with tvm.transform.PassContext(opt_level=3): 33 | mod = seq(mod) 34 | return mod 35 | 36 | 37 | def compile_export(mod,params,target,batch_size): 38 | if target == "arm": 39 | target = tvm.target.arm_cpu() 40 | with tvm.transform.PassContext(opt_level=3): 41 | mod = relay.transform.InferType()(mod) 42 | lib = relay.build(mod, target=target, params=params) 43 | lib.export_library(f"./{model_name}_{batch_size}.tar") 44 | return lib 45 | 46 | 47 | def benchmark(model_name,seq_length,batch_size,target,dtype="float32",layout="NCHW"): 
48 | input_name = "input0" 49 | 50 | inputs = np.random.randint(0, 2000, size=(seq_length)) 51 | token_types = np.random.randint(0,2,size=(seq_length)) 52 | 53 | tokens_tensor = torch.tensor(np.array([inputs])) 54 | segments_tensors = torch.tensor(np.array([token_types])) 55 | 56 | model = load_model(model_name) 57 | model.eval() 58 | 59 | traced_model = torch.jit.trace(model, tokens_tensor,segments_tensors) 60 | 61 | 62 | mod, params = relay.frontend.from_pytorch(traced_model, input_infos=[('input0', [batch_size,seq_length])],default_dtype=dtype) 63 | 64 | if layout == "NHWC": 65 | mod = convert_to_nhwc(mod) 66 | else: 67 | assert layout == "NCHW" 68 | 69 | lib=compile_export(mod,params,target,batch_size) 70 | print("export done :",f"{model_name}_{batch_size}.tar") 71 | 72 | dev = tvm.cpu() 73 | module = runtime.GraphModule(lib["default"](dev)) 74 | 75 | module.set_input(data0=tokens_tensor,data1=segments_tensors) 76 | 77 | # Evaluate 78 | ftimer = module.module.time_evaluator("run", dev, min_repeat_ms=500, repeat=10) 79 | prof_res = np.array(ftimer().results) * 1000 80 | print(f"TVM {model_name} latency for batch {batch_size} : {np.mean(prof_res[1:]):.2f} ms") 81 | 82 | 83 | 84 | if __name__ == "__main__": 85 | import argparse 86 | 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument('--model',default='bert' , type=str) 89 | parser.add_argument('--batchsize',default=1 , type=int) 90 | parser.add_argument('--seq_length',default=128 , type=int) 91 | parser.add_argument('--target',default='llvm -mcpu=core-avx2' , type=str) 92 | 93 | args = parser.parse_args() 94 | 95 | model_name = args.model 96 | batch_size = args.batchsize 97 | seq_length = args.seq_length 98 | target = args.target 99 | 100 | benchmark(model_name,seq_length,batch_size,target) 101 | -------------------------------------------------------------------------------- /torch/tvm/tvm_export.py: -------------------------------------------------------------------------------- 1 | from json import load 2 | import warnings 3 | import time 4 | import numpy as np 5 | import tvm 6 | from tvm import relay 7 | import tvm.contrib.graph_executor as runtime 8 | import torch 9 | 10 | import argparse 11 | 12 | 13 | def timer(thunk, repeat=1, number=10, dryrun=3, min_repeat_ms=1000): 14 | """Helper function to time a function""" 15 | for i in range(dryrun): 16 | thunk() 17 | ret = [] 18 | for _ in range(repeat): 19 | while True: 20 | beg = time.time() 21 | for _ in range(number): 22 | thunk() 23 | end = time.time() 24 | lat = (end - beg) * 1e3 25 | if lat >= min_repeat_ms: 26 | break 27 | number = int(max(min_repeat_ms / (lat / number) + 1, number * 1.618)) 28 | ret.append(lat / number) 29 | return ret 30 | 31 | def load_model(model_name,batchsize): 32 | 33 | PATH = f"../base/{model_name}/" 34 | model = torch.load(PATH + 'model.pt') # 전체 모델을 통째로 불러옴, 클래스 선언 필수 35 | model.load_state_dict(torch.load(PATH + 'model_state_dict.pt')) # state_dict를 불러 온 후, 모델에 저장 36 | 37 | return model 38 | 39 | 40 | def convert_to_nhwc(mod): 41 | """Convert to NHWC layout""" 42 | desired_layouts = {"nn.conv2d": ["NHWC", "default"]} 43 | seq = tvm.transform.Sequential( 44 | [ 45 | relay.transform.RemoveUnusedFunctions(), 46 | relay.transform.ConvertLayout(desired_layouts), 47 | ] 48 | ) 49 | with tvm.transform.PassContext(opt_level=3): 50 | mod = seq(mod) 51 | return mod 52 | 53 | 54 | def compile_export(mod,params,target,batch_size): 55 | if target == "arm": 56 | target = tvm.target.arm_cpu() 57 | with tvm.transform.PassContext(opt_level=3): 58 | mod = 
relay.transform.InferType()(mod) 59 | lib = relay.build(mod, target=target, params=params) 60 | lib.export_library(f"./{model_name}_{batch_size}.tar") 61 | return lib 62 | 63 | 64 | def benchmark(model_name,imgsize,batch_size,target,dtype="float32",layout="NCHW"): 65 | input_name = "input0" 66 | input_shape = (batch_size, 3, imgsize, imgsize) 67 | 68 | # data = np.random.uniform(size=input_shape) 69 | data_array = np.random.uniform(0, 255, size=input_shape).astype("float32") 70 | torch_data = torch.tensor(data_array) 71 | 72 | model = load_model(model_name,batch_size) 73 | model.eval() 74 | traced_model = torch.jit.trace(model, torch_data) 75 | 76 | # mxnet to tvm format 77 | 78 | mod, params = relay.frontend.from_pytorch(traced_model, input_infos=[('input0', input_shape)],default_dtype=dtype) 79 | 80 | if layout == "NHWC": 81 | mod = convert_to_nhwc(mod) 82 | else: 83 | assert layout == "NCHW" 84 | 85 | lib=compile_export(mod,params,target,batch_size) 86 | print("export done :",f"{model_name}_{batch_size}.tar") 87 | 88 | dev = tvm.cpu() 89 | module = runtime.GraphModule(lib["default"](dev)) 90 | 91 | data = np.random.uniform(size=input_shape) 92 | module.set_input(input_name, data) 93 | 94 | # Evaluate 95 | ftimer = module.module.time_evaluator("run", dev, min_repeat_ms=500, repeat=10) 96 | prof_res = np.array(ftimer().results) * 1000 97 | print(f"TVM {model_name} latency for batch {batch_size} : {np.mean(prof_res[1:]):.2f} ms") 98 | 99 | 100 | 101 | if __name__ == "__main__": 102 | import argparse 103 | 104 | parser = argparse.ArgumentParser() 105 | parser.add_argument('--model',default='resnet50' , type=str) 106 | parser.add_argument('--batchsize',default=1 , type=int) 107 | parser.add_argument('--target',default='llvm -mcpu=core-avx2' , type=str) 108 | 109 | args = parser.parse_args() 110 | 111 | model_name = args.model 112 | batch_size = args.batchsize 113 | target = args.target 114 | 115 | img_size = 224 116 | 117 | if model_name == 'inception_v3': 118 | img_size = 299 119 | benchmark(model_name,img_size,batch_size,target) 120 | -------------------------------------------------------------------------------- /torch/tvm/tvm_profiling.py: -------------------------------------------------------------------------------- 1 | from json import load 2 | import warnings 3 | import time 4 | import numpy as np 5 | import tvm 6 | from tvm import relay 7 | import tvm.contrib.graph_executor as runtime 8 | from tvm.contrib.debugger import debug_executor 9 | 10 | import torch 11 | 12 | import argparse 13 | 14 | 15 | def timer(thunk, repeat=1, number=10, dryrun=3, min_repeat_ms=1000): 16 | """Helper function to time a function""" 17 | for i in range(dryrun): 18 | thunk() 19 | ret = [] 20 | for _ in range(repeat): 21 | while True: 22 | beg = time.time() 23 | for _ in range(number): 24 | thunk() 25 | end = time.time() 26 | lat = (end - beg) * 1e3 27 | if lat >= min_repeat_ms: 28 | break 29 | number = int(max(min_repeat_ms / (lat / number) + 1, number * 1.618)) 30 | ret.append(lat / number) 31 | return ret 32 | 33 | def load_model(model_name,batchsize): 34 | 35 | PATH = f"../base/{model_name}/" 36 | model = torch.load(PATH + 'model.pt') # 전체 모델을 통째로 불러옴, 클래스 선언 필수 37 | model.load_state_dict(torch.load(PATH + 'model_state_dict.pt')) # state_dict를 불러 온 후, 모델에 저장 38 | 39 | return model 40 | 41 | 42 | def convert_to_nhwc(mod): 43 | """Convert to NHWC layout""" 44 | desired_layouts = {"nn.conv2d": ["NHWC", "default"]} 45 | seq = tvm.transform.Sequential( 46 | [ 47 | relay.transform.RemoveUnusedFunctions(), 
48 | relay.transform.ConvertLayout(desired_layouts), 49 | ] 50 | ) 51 | with tvm.transform.PassContext(opt_level=3): 52 | mod = seq(mod) 53 | return mod 54 | 55 | 56 | def compile_export(mod,params,target,batch_size): 57 | if target == "arm": 58 | target = tvm.target.arm_cpu() 59 | with relay.build_config(opt_level=3): 60 | complied_graph_lib = relay.build(mod, target, params=params) 61 | 62 | return complied_graph_lib 63 | 64 | 65 | def benchmark(model_name,imgsize,batch_size,target,dtype="float32",layout="NCHW"): 66 | input_name = "input0" 67 | input_shape = (batch_size, 3, imgsize, imgsize) 68 | out_shape = (batch_size, 1000) 69 | # data = np.random.uniform(size=input_shape) 70 | data_array = np.random.uniform(0, 255, size=input_shape).astype("float32") 71 | torch_data = torch.tensor(data_array) 72 | 73 | model = load_model(model_name,batch_size) 74 | model.eval() 75 | traced_model = torch.jit.trace(model, torch_data) 76 | 77 | # mxnet to tvm format 78 | 79 | mod, params = relay.frontend.from_pytorch(traced_model, input_infos=[('input0', input_shape)],default_dtype=dtype) 80 | 81 | if layout == "NHWC": 82 | mod = convert_to_nhwc(mod) 83 | else: 84 | assert layout == "NCHW" 85 | 86 | dev = tvm.cpu() 87 | data = np.random.uniform(size=input_shape) 88 | 89 | complied_graph_lib =compile_export(mod,params,target,batch_size) 90 | # gmod = complied_graph_lib["debug_create"]("default", dev) 91 | # set_input = gmod["set_input"] 92 | # run = gmod["run"] 93 | # get_output = gmod["get_output"] 94 | # set_input("data", tvm.nd.array(data)) 95 | # run() 96 | # out = get_output(0).numpy() 97 | 98 | debug_g_mod = debug_executor.GraphModuleDebug( 99 | complied_graph_lib["debug_create"]("default", dev), 100 | [dev], 101 | complied_graph_lib.get_graph_json(), 102 | None, 103 | ) 104 | debug_g_mod.set_input("input0", data) 105 | debug_g_mod.run() 106 | out = debug_g_mod.get_output(0).numpy() 107 | 108 | # Evaluate 109 | ftimer = debug_g_mod.module.time_evaluator("run", dev, min_repeat_ms=500, repeat=10) 110 | prof_res = np.array(ftimer().results) * 1000 111 | print(f"TVM {model_name} latency for batch {batch_size} : {np.mean(prof_res[1:]):.2f} ms") 112 | 113 | 114 | 115 | if __name__ == "__main__": 116 | import argparse 117 | 118 | parser = argparse.ArgumentParser() 119 | parser.add_argument('--model',default='resnet50' , type=str) 120 | parser.add_argument('--batchsize',default=1 , type=int) 121 | parser.add_argument('--target',default='llvm -mcpu=core-avx2' , type=str) 122 | 123 | args = parser.parse_args() 124 | 125 | model_name = args.model 126 | batch_size = args.batchsize 127 | target = args.target 128 | 129 | img_size = 224 130 | 131 | if model_name == 'inception_v3': 132 | img_size = 299 133 | benchmark(model_name,img_size,batch_size,target) -------------------------------------------------------------------------------- /torch/tvm/tvm_serving.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import argparse 4 | import tvm.contrib.graph_executor as runtime 5 | 6 | from tvm import relay 7 | from tvm.relay import testing 8 | import tvm 9 | from tvm import te 10 | 11 | def benchmark(model_name, img_size,batch_size, repeat=3): 12 | ctx = tvm.cpu() 13 | 14 | input_name = "input0" 15 | input_shape = (batch_size, 3, img_size, img_size) 16 | output_shape = (batch_size, 1000) 17 | 18 | loaded_lib = tvm.runtime.load_module(f'./{model_name}_{batch_size}.tar') 19 | 20 | module = runtime.GraphModule(loaded_lib["default"](ctx)) 21 | 22 
| # Feed input data 23 | data = np.random.uniform(size=input_shape) 24 | module.set_input(input_name, data) 25 | 26 | # Evaluate 27 | ftimer = module.module.time_evaluator("run", ctx, min_repeat_ms=500, repeat=repeat) 28 | prof_res = np.array(ftimer().results) * 1000 29 | print(f"TVM {model_name} latency for batch {batch_size} : {np.mean(prof_res):.2f} ms") 30 | 31 | return np.array(ftimer().results) 32 | 33 | 34 | if __name__ == "__main__": 35 | import argparse 36 | 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--model',default='resnet50' , type=str) 39 | parser.add_argument('--batchsize',default=1 , type=int) 40 | 41 | args = parser.parse_args() 42 | 43 | model_name = args.model 44 | batch_size = args.batchsize 45 | img_size = 224 46 | 47 | 48 | if model_name == 'inception_v3': 49 | img_size = 299 50 | benchmark(model_name,img_size,batch_size) 51 | --------------------------------------------------------------------------------
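Usage note (inferred from the argparse defaults and file paths in the scripts above; the repository itself does not document a workflow): for the PyTorch-to-TVM path, torch/base/ic_model_export.py is run first to save ./{model}/model.pt and model_state_dict.pt; torch/tvm/tvm_export.py then traces that model, compiles it with Relay for the given --target (default "llvm -mcpu=core-avx2"), and writes ./{model}_{batchsize}.tar; torch/tvm/tvm_serving.py finally loads that archive with the TVM graph executor and prints the mean latency. The MXNet scripts follow the same export-then-serve pattern, and in every serving script inception_v3 switches the input resolution from 224 to 299.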