├── paper
│   └── DADS.pdf
├── models
│   ├── AlexNet.py
│   ├── VggNet.py
│   ├── EasyModel.py
│   ├── InceptionBlock.py
│   └── InceptionBlockV2.py
├── utils
│   ├── excel_utils.py
│   └── inference_utils.py
├── assets
│   ├── image-20230709090527770.png
│   ├── image-20230709091251573.png
│   ├── image-20230709091256841.png
│   ├── image-20230709092724650.png
│   ├── image-20230709094554166.png
│   ├── image-20230709095005389.png
│   └── image-20230709095546622.png
├── net
│   ├── monitor_client.py
│   ├── monitor_server.py
│   └── net_utils.py
├── dads_framework
│   ├── dads.py
│   ├── dinic.py
│   └── graph_construct.py
├── cloud_api.py
├── edge_api.py
├── server_func.py
└── README.md

--------------------------------------------------------------------------------
/paper/DADS.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tjyy-1223/DADS/HEAD/paper/DADS.pdf
--------------------------------------------------------------------------------
/assets/image-20230709090527770.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tjyy-1223/DADS/HEAD/assets/image-20230709090527770.png
--------------------------------------------------------------------------------
/assets/image-20230709091251573.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tjyy-1223/DADS/HEAD/assets/image-20230709091251573.png
--------------------------------------------------------------------------------
/assets/image-20230709091256841.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tjyy-1223/DADS/HEAD/assets/image-20230709091256841.png
--------------------------------------------------------------------------------
/assets/image-20230709092724650.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tjyy-1223/DADS/HEAD/assets/image-20230709092724650.png
--------------------------------------------------------------------------------
/assets/image-20230709094554166.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tjyy-1223/DADS/HEAD/assets/image-20230709094554166.png
--------------------------------------------------------------------------------
/assets/image-20230709095005389.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tjyy-1223/DADS/HEAD/assets/image-20230709095005389.png
--------------------------------------------------------------------------------
/assets/image-20230709095546622.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tjyy-1223/DADS/HEAD/assets/image-20230709095546622.png
--------------------------------------------------------------------------------
/cloud_api.py:
--------------------------------------------------------------------------------
import torch
import sys, getopt
import warnings
warnings.filterwarnings("ignore")
from net.monitor_server import MonitorServer
from server_func import start_server
from net import net_utils

"""
Cloud-device API: receives the intermediate data, runs the remaining part of the
DNN on the cloud, and saves the results in an excel sheet.
server launch command: python cloud_api.py -i 127.0.0.1 -p 9999 -d cpu
    "-i", "--ip"       server ip address
    "-p", "--port"     server port
    "-d", "--device"   whether the server uses GPU for inference: cpu or cuda
"""
if __name__ == '__main__':
    try:
        opts, args = getopt.getopt(sys.argv[1:], "i:p:d:", ["ip=", "port=", "device="])
    except getopt.GetoptError:
        print('input argv error')
        sys.exit(2)

    # the options are given as (opt, arg) tuples
    ip, port = "127.0.0.1", 8090
    device = "cpu"
    for opt, arg in opts:
        if opt in ("-i", "--ip"):
            ip = arg
        elif opt in ("-p", "--port"):
            port = int(arg)
        elif opt in ("-d", "--device"):
            device = arg

    if device == "cuda" and not torch.cuda.is_available():
        raise RuntimeError("cuda is not available on this machine")

    while True:
        # open the server socket and start listening
        socket_server = net_utils.get_socket_server(ip, port)

        # start the bandwidth-monitor server
        monitor_ser = MonitorServer(ip=ip)
        monitor_ser.start()
        monitor_ser.join()

        start_server(socket_server, device)
        monitor_ser.terminate()

--------------------------------------------------------------------------------
/dads_framework/dads.py:
--------------------------------------------------------------------------------
from dads_framework.dinic import dinic_algorithm, get_min_cut_set
from dads_framework.graph_construct import graph_construct

def algorithm_DSL(model, model_input, edge_latency_list, cloud_latency_list, bandwidth, net_type="wifi"):
    """
    Select the optimal partition strategy for the given model under light workload.
    :param model: the DNN model
    :param model_input: the model input
    :param edge_latency_list: per-layer inference latency on the edge device
    :param cloud_latency_list: per-layer inference latency on the cloud device
    :param bandwidth: network bandwidth in MB/s
    :param net_type: current network type, "wifi" by default
    :return: the cut set of the directed graph (excluding the edge and cloud vertices),
             plus dict_node_layer, which records which model layer each vertex maps to
    """
    # build the corresponding directed graph
    graph, dict_node_layer, dict_layer_input_size = graph_construct(model, model_input, edge_latency_list, cloud_latency_list, bandwidth=bandwidth, net_type=net_type)
    # min_cut_value is the minimal inference latency; reachable holds the vertices that
    # run on the edge, non_reachable the vertices that run on the cloud
    min_cut_value, reachable, non_reachable = dinic_algorithm(graph)

    # useful when debugging:
    # for edge in graph.edges(data=True):
    #     print(edge)
    # print(reachable)
    # print(non_reachable)

    # graph_partition_edge holds the edges of the graph that must be cut
    graph_partition_edge = get_min_cut_set(graph, min_cut_value, reachable, non_reachable)
    return graph_partition_edge, dict_node_layer


def get_partition_points(graph_partition_edge, dict_node_layer):
    """
    Translate the cut set of the directed graph (graph_partition_edge) into the
    DNN partition points (model_partition_edge).
    :param graph_partition_edge: cut set of the directed graph
    :param dict_node_layer: mapping from graph vertices to model layers
    :return: model_partition_edge: between which two layers the model is split
    """
    model_partition_edge = []
    for graph_edge in graph_partition_edge:
        # split between layer start_layer and layer end_layer of the DNN
        # (i.e. the split happens right after start_layer)
        start_layer = dict_node_layer[graph_edge[0]]
        end_layer = dict_node_layer[graph_edge[1]]
        model_partition_edge.append((start_layer, end_layer))
    return model_partition_edge
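The two functions above can be exercised offline, without a cloud connection. A minimal sketch, not part of the repository, that profiles this machine twice to stand in for the edge and cloud latency lists and uses a hand-picked bandwidth of 10 MB/s:

```python
import torch
from utils.inference_utils import get_dnn_model
from dads_framework.dads import algorithm_DSL, get_partition_points
from dads_framework.graph_construct import get_layers_latency

model = get_dnn_model("easy_net")
x = torch.rand(size=(1, 3, 224, 224), requires_grad=False)

# run the profiler twice on this machine as a stand-in for edge/cloud measurements
edge_latency_list = get_layers_latency(model, device="cpu")
cloud_latency_list = get_layers_latency(model, device="cpu")

graph_partition_edge, dict_node_layer = algorithm_DSL(
    model, x, edge_latency_list, cloud_latency_list, bandwidth=10)  # 10 MB/s is assumed
print(get_partition_points(graph_partition_edge, dict_node_layer))
```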
--------------------------------------------------------------------------------
/edge_api.py:
--------------------------------------------------------------------------------
import torch
import sys, getopt
from server_func import start_client
from net.monitor_client import MonitorClient
import multiprocessing

import warnings
warnings.filterwarnings("ignore")


"""
Edge-device API: starts the edge device, runs the first part of the DNN,
and sends the intermediate data to the cloud device.
client launch command: python edge_api.py -i 127.0.0.1 -p 9999 -d cpu -t easy_net
    "-t", "--type"     model type: "alex_net" "vgg_net" "easy_net" "inception" "inception_v2"
    "-i", "--ip"       server ip address
    "-p", "--port"     server port
    "-d", "--device"   whether the client uses GPU for inference: cpu or cuda
"""
if __name__ == '__main__':
    try:
        opts, args = getopt.getopt(sys.argv[1:], "t:i:p:d:", ["type=", "ip=", "port=", "device="])
    except getopt.GetoptError:
        print('input argv error')
        sys.exit(2)

    # the options are given as (opt, arg) tuples
    model_type = ""
    ip, port = "127.0.0.1", 9999
    device = "cpu"
    for opt, arg in opts:
        if opt in ("-t", "--type"):
            model_type = arg
        elif opt in ("-i", "--ip"):
            ip = arg
        elif opt in ("-p", "--port"):
            port = int(arg)
        elif opt in ("-d", "--device"):
            device = arg

    if device == "cuda" and not torch.cuda.is_available():
        raise RuntimeError("cuda is not available on this machine")


    # start the bandwidth-monitor client
    # without two physical devices, the bandwidth can be hard-coded instead:
    # bandwidth_value = 10  # Mbps
    bandwidth_value = multiprocessing.Value('d', 0.0)
    monitor_cli = MonitorClient(ip=ip, bandwidth_value=bandwidth_value)
    monitor_cli.start()

    # wait for the child process to finish, then read the bandwidth
    monitor_cli.join()
    print(f"get bandwidth value : {bandwidth_value.value} MB/s")

    # step2: prepare the input data
    x = torch.rand(size=(1, 3, 224, 224), requires_grad=False)
    x = x.to(device)

    # deployment phase - choose the optimal partition point
    # upload_bandwidth = bandwidth_value.value  # MBps
    upload_bandwidth = 10  # MBps - fixed to 10 so the demo always runs; on real machines use the line above

    # simulate cloud-edge collaborative inference
    start_client(ip, port, x, model_type, upload_bandwidth, device)
--------------------------------------------------------------------------------
/net/monitor_client.py:
--------------------------------------------------------------------------------
import time
import torch
from multiprocessing import Process
from net import net_utils
from apscheduler.schedulers.blocking import BlockingScheduler
import multiprocessing


class MonitorClient(Process):
    """
    Bandwidth-monitor client. Workflow (triggered periodically by a timer):
    1. generate data and send it to the server, which records the time
    2. receive the measured transfer figure and share it via process communication,
       so the edge side can use it for model partitioning
    """
    def __init__(self, ip, bandwidth_value, port=9922, interval=3):
        super(MonitorClient, self).__init__()
        self.ip = ip
        self.bandwidth_value = bandwidth_value
        self.port = port
        self.interval = interval


    def start_client(self) -> None:
        # the data to transfer
        data = torch.rand((1, 3, 224, 224))

        while True:
            try:
                # connect to the server; keep retrying on failure
                conn = net_utils.get_socket_client(self.ip, self.port)
                # send the data
                net_utils.send_data(conn, data, "data", show=False)

                # insert a break message to avoid sticky packets
                net_utils.send_short_data(conn, "break", show=False)

                # leave the loop once the measured bandwidth comes back
                latency = net_utils.get_short_data(conn)
                # print(f"monitor client get latency : {latency} MB/s ")
                if latency is not None:
                    self.bandwidth_value.value = latency
                    net_utils.close_conn(conn)
                    break
                time.sleep(1)
            except ConnectionRefusedError:
                pass
                # print("[Errno 61] Connection refused, try again.")

    def schedular(self):
        # measure the bandwidth periodically on a timer
        # create the scheduler
        scheduler = BlockingScheduler()

        # add the job
        scheduler.add_job(self.start_client, 'interval', seconds=self.interval)
        scheduler.start()


    def run(self) -> None:
        # self.schedular()
        self.start_client()


# if __name__ == '__main__':
#     ip = "127.0.0.1"
#     bandwidth_value = multiprocessing.Value('d', 0.0)
#     monitor_cli = MonitorClient(ip=ip, bandwidth_value=bandwidth_value)
#
#     monitor_cli.start()
#     monitor_cli.join()
--------------------------------------------------------------------------------
/net/monitor_server.py:
--------------------------------------------------------------------------------
from multiprocessing import Process
from apscheduler.schedulers.blocking import BlockingScheduler
from net import net_utils

def get_bandwidth(conn):
    """
    Compute the bandwidth from a single transfer.
    :param conn: an established connection
    :return: bandwidth in MB/s
    """
    # measure the transfer latency
    _, latency = net_utils.get_data(conn)
    # print(f"{latency} ms \n")
    # size of the payload in bytes; the received tensor is fixed to [1, 3, 224, 224]
    # data_size = 1 * 3 * 224 * 224 * 4

    # x = torch.rand((1, 3, 224, 224))
    # print(len(pickle.dumps(x)))
    # the pickled payload is 602541 bytes
    data_size = 602541

    # bandwidth in MB/s
    bandwidth = (data_size / 1024 / 1024) / (latency / 1000)
    print(f"monitor server get bandwidth : {bandwidth} MB/s ")
    return bandwidth


class MonitorServer(Process):
    """
    Bandwidth-monitor server, bound to the given ip, port 9922 by default. Workflow:
    1. receive the data sent by the monitor client (triggered periodically by a timer)
    2. record the transfer latency (ms)
    3. compute the bandwidth and convert it to MB/s
    4. send the bandwidth back to the client
    """
    def __init__(self, ip, port=9922, interval=3):
        super(MonitorServer, self).__init__()
        self.ip = ip
        self.port = port
        self.interval = interval


    def start_server(self) -> None:
        # create a server socket
        socket_server = net_utils.get_socket_server(self.ip, self.port)
        # optionally stop blocking after 10 s without a connection:
        # socket_server.settimeout(10)

        # wait for a client; blocks until one connects
        conn, client = socket_server.accept()

        # measure the bandwidth in MB/s
        bandwidth = get_bandwidth(conn)

        # consume the break message to avoid sticky packets
        net_utils.get_short_data(conn)

        # send the measured bandwidth back to the client
        net_utils.send_short_data(conn, bandwidth, "bandwidth", show=False)

        # close the connection
        net_utils.close_conn(conn)
        net_utils.close_socket(socket_server)


    def schedular(self):
        # measure the bandwidth periodically on a timer
        # create the scheduler
        scheduler = BlockingScheduler()

        # add the job
        scheduler.add_job(self.start_server, 'interval', seconds=self.interval)
        scheduler.start()


    def run(self) -> None:
        # self.schedular()
        self.start_server()



# if __name__ == '__main__':
#     ip = "127.0.0.1"
#     monitor_ser = MonitorServer(ip=ip)
#
#     monitor_ser.start()
#     monitor_ser.join()
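The two commented-out `__main__` blocks above can be combined into a single local smoke test of the monitor pair. A minimal sketch, run on one machine over the loopback address:

```python
import multiprocessing
from net.monitor_client import MonitorClient
from net.monitor_server import MonitorServer

if __name__ == '__main__':
    ip = "127.0.0.1"
    monitor_ser = MonitorServer(ip=ip)
    monitor_ser.start()

    bandwidth_value = multiprocessing.Value('d', 0.0)  # shared with the child process
    monitor_cli = MonitorClient(ip=ip, bandwidth_value=bandwidth_value)
    monitor_cli.start()

    monitor_cli.join()
    monitor_ser.join()
    print(f"measured bandwidth: {bandwidth_value.value} MB/s")
```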
--------------------------------------------------------------------------------
/models/AlexNet.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from collections import abc

class AlexNet(nn.Module):
    def __init__(self, input_channels=3, num_classes: int = 1000) -> None:
        """
        input_channels: number of channels of the input image, 3 by default
        num_classes: output dimension of AlexNet, 1000 by default
        """
        super(AlexNet, self).__init__()
        self.has_dag_topology = False
        self.layers = nn.Sequential(
            nn.Conv2d(input_channels, 64, kernel_size=(11, 11), stride=(4, 4), padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=(5, 5), padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=(3, 3), padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=(3, 3), padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=(3, 3), padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.AdaptiveAvgPool2d((6, 6)),
            nn.Flatten(),
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )
        self.len = len(self.layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.layers(x)
        return x

    def __iter__(self):
        """ iterate over the layers of AlexNet """
        return SentenceIterator(self.layers)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        layer = nn.Sequential()
        try:
            if index < self.len:
                layer = self.layers[index]
        except IndexError:
            raise StopIteration()
        return layer


class SentenceIterator(abc.Iterator):
    """
    AlexNet iterator
    Iteration helper for the AlexNet network: passing it into AlexNet's __iter__
    enables layer-by-layer traversal of the network.
    """
    def __init__(self, layers):
        self.layers = layers
        self._index = 0
        self.len = len(layers)

    def __next__(self):
        layer = nn.Sequential()
        try:
            if self._index <= self.len:
                layer = self.layers[self._index]
        except IndexError:
            raise StopIteration()
        else:
            self._index += 1
        return layer
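AlexNet is a plain chain, so the iterator protocol above can drive the whole forward pass layer by layer. A short sketch, not part of the repository, of what that traversal looks like:

```python
import torch
from models.AlexNet import AlexNet

model = AlexNet()
x = torch.rand((1, 3, 224, 224))
# __iter__ yields each layer in order; feeding the output of one layer
# into the next reproduces model(x) for a chain-topology model
for idx, layer in enumerate(model):
    x = layer(x)
    print(f"{idx + 1}: {layer.__class__.__name__} -> {tuple(x.shape)}")
```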
--------------------------------------------------------------------------------
/utils/excel_utils.py:
--------------------------------------------------------------------------------
import xlrd
import xlwt
from xlutils.copy import copy

def create_excel_xsl(path, sheet_name, value):
    """
    Create an excel workbook and sheet from the given value.
    :param path: workbook path
    :param sheet_name: sheet name
    :param value: the header row, formatted as below
    :return: None

    value = [["feature1", "feature2", "feature3"....]]
    """
    index = len(value)
    try:
        with xlrd.open_workbook(path) as workbook:
            workbook = copy(workbook)
            # worksheet = workbook.sheet_by_name(sheet_name)
            worksheet = workbook.add_sheet(sheet_name)  # create a new sheet in the workbook
            for i in range(len(value[0])):
                worksheet.col(i).width = 256 * 30  # Set the column width
            for i in range(0, index):
                for j in range(0, len(value[i])):
                    worksheet.write(i, j, value[i][j])
            workbook.save(path)
            print("xls sheet created successfully")
    except FileNotFoundError:
        workbook = xlwt.Workbook()  # create a new workbook
        worksheet = workbook.add_sheet(sheet_name)  # create a new sheet in the workbook
        for i in range(len(value[0])):
            worksheet.col(i).width = 256 * 30  # Set the column width
        for i in range(0, index):
            for j in range(0, len(value[i])):
                worksheet.write(i, j, value[i][j])
        workbook.save(path)
        print("xls sheet created successfully")


def write_excel_xls_append(path, sheet_name, value):
    """
    Append the given value to the specified excel sheet.
    :param path: workbook path
    :param sheet_name: sheet name
    :param value: the rows to append, formatted as below
    :return: None

    value = [["feature1", "feature2", "feature3"....]]
    """
    index = len(value)
    workbook = xlrd.open_workbook(path)
    worksheet = workbook.sheet_by_name(sheet_name)

    rows_old = worksheet.nrows  # number of rows already stored in the sheet
    new_workbook = copy(workbook)  # convert the xlrd object into an xlwt object
    new_worksheet = new_workbook.get_sheet(sheet_name)

    for i in range(len(value[0])):
        new_worksheet.col(i).width = 256 * 30  # Set the column width

    for i in range(0, index):
        for j in range(0, len(value[i])):
            new_worksheet.write(i + rows_old, j, value[i][j])

    new_workbook.save(path)  # save the workbook
    print("appended data to the xls sheet successfully!")


def sheet_exists(path, sheet_name):
    """
    Check whether the given sheet exists in the excel workbook.
    :param path: workbook path
    :param sheet_name: sheet name
    :return: True or False
    """
    try:
        workbook = xlrd.open_workbook(path)
        worksheet = workbook.sheet_by_name(sheet_name)
        if worksheet is None:
            return False
        return True
    except Exception:
        return False


def read_excel_xls(path, sheet_name):
    """
    Print the data stored in the excel sheet.
    :param path: workbook path
    :param sheet_name: sheet name
    :return:
    """
    workbook = xlrd.open_workbook(path)  # open the workbook
    worksheet = workbook.sheet_by_name(sheet_name)  # fetch the sheet by name
    for i in range(0, worksheet.nrows):
        for j in range(0, worksheet.ncols):
            print(worksheet.cell_value(i, j), "\t", end="")  # read the data row by row, column by column
        print()


def get_excel_data(path, sheet_name, col_name):
    """
    Read the data of the specified column from the excel sheet.
    :param path: workbook path
    :param sheet_name: sheet name
    :param col_name: the column (attribute) name inside the sheet
    :return: the corresponding data as a list
    """
    workbook = xlrd.open_workbook(path)  # open the workbook
    worksheet = workbook.sheet_by_name(sheet_name)  # fetch the sheet by name

    col_index = -1
    for j in range(0, worksheet.ncols):
        if worksheet.cell_value(0, j) == col_name:
            col_index = j
    if col_index == -1:
        print("no matched col name")
        return None

    data = []
    for i in range(1, worksheet.nrows):
        for j in range(0, worksheet.ncols):
            if j == col_index:
                data.append(worksheet.cell_value(i, j))
    return data
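A short usage sketch for the helpers above: create a sheet, append one row, and print it back. The file name is illustrative:

```python
from utils.excel_utils import create_excel_xsl, write_excel_xls_append, read_excel_xls

path = "result.xls"  # hypothetical output path
create_excel_xsl(path, "model", [["index", "layerName", "computation_time(ms)"]])
write_excel_xls_append(path, "model", [[1, "Conv2d", 0.42]])
read_excel_xls(path, "model")
```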
--------------------------------------------------------------------------------
/server_func.py:
--------------------------------------------------------------------------------
from utils import inference_utils
from dads_framework.dads import algorithm_DSL, get_partition_points
from dads_framework.graph_construct import get_layers_latency
import net.net_utils as net

def start_server(socket_server, device):
    """
    Start listening for messages from the client.
    Normally called directly only in cloud_api.py.
    :param socket_server: the server socket
    :param device: run locally on cpu or cuda
    :return: None
    """
    # wait for a client connection
    conn, client = net.wait_client(socket_server)

    # receive the model type
    model_type = net.get_short_data(conn)
    print(f"get model type successfully.")

    # load the model
    model = inference_utils.get_dnn_model(model_type)

    # measure the per-layer latency on the cloud
    cloud_latency_list = get_layers_latency(model, device=device)
    net.send_short_data(conn, cloud_latency_list, "model latency on the cloud device.")

    # receive the model partition points
    model_partition_edge = net.get_short_data(conn)
    print(f"get partition point successfully.")

    # split the model into an edge part and a cloud part
    _, cloud_model = inference_utils.model_partition(model, model_partition_edge)
    cloud_model = cloud_model.to(device)

    # receive the intermediate data and return the transfer latency
    edge_output, transfer_latency = net.get_data(conn)

    # avoid sending two messages back to back - prevents sticky packets
    conn.recv(40)

    print(f"get edge_output and transfer latency successfully.")
    net.send_short_data(conn, transfer_latency, "transfer latency")

    # avoid sending two messages back to back - prevents sticky packets
    conn.recv(40)

    inference_utils.warmUp(cloud_model, edge_output, device)

    # record the cloud inference latency
    cloud_output, cloud_latency = inference_utils.recordTime(cloud_model, edge_output, device, epoch_cpu=30,
                                                             epoch_gpu=100)
    print(f"{model_type} finished inference on the cloud device - {cloud_latency:.3f} ms")
    net.send_short_data(conn, cloud_latency, "cloud latency")

    print("================= DNN Collaborative Inference Finished. ===================")


def start_client(ip, port, input_x, model_type, upload_bandwidth, device):
    """
    Start a client and send an inference request to the server.
    Normally called directly only in edge_api.py.
    :param ip: server ip address
    :param port: server port
    :param input_x: the initial input
    :param model_type: the selected model type
    :param upload_bandwidth: upload bandwidth
    :param device: run locally on cpu or cuda
    :return: None
    """
    # load the model
    model = inference_utils.get_dnn_model(model_type)
    # connect to the cloud
    conn = net.get_socket_client(ip, port)

    # request the per-layer inference latency from the cloud
    net.send_short_data(conn, model_type, msg="model type")
    edge_latency_list = get_layers_latency(model, device=device)  # per-layer latency on the edge
    cloud_latency_list = net.get_short_data(conn)  # per-layer latency received from the cloud

    # get the cut set of the graph and the dict_node_layer mapping
    graph_partition_edge, dict_node_layer = algorithm_DSL(model, input_x,
                                                          edge_latency_list, cloud_latency_list,
                                                          bandwidth=upload_bandwidth)
    # after which layers the DNN model should be split
    model_partition_edge = get_partition_points(graph_partition_edge, dict_node_layer)
    print(f"partition edges : {model_partition_edge}")

    # send the partition points
    net.send_short_data(conn, model_partition_edge, msg="partition strategy")

    # split the model into an edge part and a cloud part
    edge_model, _ = inference_utils.model_partition(model, model_partition_edge)
    edge_model = edge_model.to(device)

    # run inference on the edge, warming up first
    inference_utils.warmUp(edge_model, input_x, device)
    edge_output, edge_latency = inference_utils.recordTime(edge_model, input_x, device, epoch_cpu=30, epoch_gpu=100)
    print(f"{model_type} finished inference on the edge device - {edge_latency:.3f} ms")

    # send the intermediate data
    net.send_data(conn, edge_output, "edge output")

    # avoid receiving two messages back to back - prevents sticky packets
    conn.sendall("avoid sticky".encode())

    transfer_latency = net.get_short_data(conn)
    print(f"{model_type} transfer finished - {transfer_latency:.3f} ms")

    # avoid receiving two messages back to back - prevents sticky packets
    conn.sendall("avoid sticky".encode())

    cloud_latency = net.get_short_data(conn)
    print(f"{model_type} finished inference on the cloud device - {cloud_latency:.3f} ms")

    print("================= DNN Collaborative Inference Finished. ===================")
    conn.close()
--------------------------------------------------------------------------------
/models/VggNet.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from collections import abc


class VggNet(nn.Module):
    def __init__(self, input_channels=3, num_classes: int = 1000, init_weights: bool = True) -> None:
        """
        input_channels: number of channels of the input image, 3 by default
        num_classes: output dimension of VggNet, 1000 by default
        """
        super(VggNet, self).__init__()
        self.has_dag_topology = False

        self.layers = nn.Sequential(
            nn.Conv2d(input_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.AdaptiveAvgPool2d((7, 7)),
            nn.Flatten(),
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )
        self.len = len(self.layers)
        if init_weights:
            self._initialize_weights()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.layers(x)
        return x

    def __iter__(self):
        """ iterate over the layers of VGG-16 """
        return SentenceIterator(self.layers)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        layer = nn.Sequential()
        try:
            if index < self.len:
                layer = self.layers[index]
        except IndexError:
            raise StopIteration()
        return layer

    def _initialize_weights(self) -> None:
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
class SentenceIterator(abc.Iterator):
    """
    VGG-16 iterator
    Iteration helper for the VGG-16 network: passing it into VGG-16's __iter__
    enables layer-by-layer traversal of the network.
    """
    def __init__(self, layers):
        self.layers = layers
        self._index = 0
        self.len = len(layers)

    def __next__(self):
        layer = nn.Sequential()
        try:
            if self._index <= self.len:
                layer = self.layers[self._index]
        except IndexError:
            raise StopIteration()
        else:
            self._index += 1
        return layer

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DADS

💻 Welcome! If you also work on cloud-edge collaborative inference, feel free to get in touch.

💻 If you find any bugs in the code, please open an issue and I will fix them.

🥳 This project reproduces the classic paper: Dynamic adaptive DNN surgery for inference acceleration on the edge.

Paper link 🔗: [DADS](https://github.com/Tjyy-1223/DADS/blob/main/paper/DADS.pdf)

## Research Overview

**Recent DNNs are no longer restricted to chain topologies: DAG topologies have become popular.**

For example, GoogleNet and ResNet are DAG structures. Partitioning a DAG graph instead of a chain clearly involves harder graph-theoretic problems. DADS (Dynamic Adaptive DNN Surgery) optimally splits the DNN while continuously monitoring the network state.

The key designs of DADS are:

- DADS continuously monitors the network and determines whether the system runs under light or heavy workload.
- **Under light workload,** the DSL algorithm minimizes the total latency of processing one frame. To solve this latency-minimization problem, the original problem is transformed into an equivalent min-cut problem, so that the global optimum can be found.
- **Under heavy workload,** the DSH algorithm maximizes the throughput, i.e. the number of frames that can be processed per unit time. The paper proves that this optimization problem is NP-hard and cannot be solved in polynomial time; DSH therefore uses an approximation method with an approximation ratio of 3.

![image-20230709090527770](./assets/image-20230709090527770.png)

**What this project implements:**

+ transforming a DAG-topology DNN into a directed graph
+ finding the min-cut of the directed graph with the dinic algorithm and turning it into a DNN partition strategy
+ running the partitioned DNN in cloud-edge collaborative mode

**Limitations:**

+ Only the DSL() algorithm, i.e. plain inference of a single DNN task, is implemented so far; the DSH() algorithm for partition selection under heavy workload may be added later.
+ DADS requires the model structure to be designed by hand: the branch points and merge points of a DAG network, and how to iterate over a DAG structure, all have to be specified manually. See the models folder for the detailed design steps.
+ In the DADS framework, per-layer inference times on the cloud and edge devices are measured in advance, and the cloud-side latencies have to be sent back to the edge. This adds overhead to the construction phase and suffers when network conditions are poor.
+ Since DADS assumes that every node represents a single DNN layer, it is hard to apply directly to existing complex structures such as GoogleNet, unless all layers are unfolded and the complex internal structure is rebuilt. PyTorch and later DNNs wrap similar structures into an Inception Block, and DADS can only split at the two ends of such a block.

## Project Structure

```python
DADS
├── server_func.py              # helper functions for cloud-edge communication
├── cloud_api.py                # entry point simulating the cloud device
├── edge_api.py                 # entry point simulating the edge device
├── dads_framework              # the dads framework
│   ├── dads.py                 # DSL(): select the optimal partition point
│   ├── dinic.py                # the dinic algorithm
│   └── graph_construct.py      # directed-graph construction
├── models                      # DNN models used
│   ├── AlexNet.py
│   ├── EasyModel.py
│   ├── InceptionBlock.py
│   ├── InceptionBlockV2.py
│   ├── VggNet.py
├── net                         # networking module
│   ├── monitor_client.py       # bandwidth-monitor client
│   ├── monitor_server.py       # bandwidth-monitor server
│   └── net_utils.py            # networking helpers
├── paper                       # the paper
│   └── DADS.pdf
└── utils                       # other utilities
    ├── excel_utils.py          # excel helpers
    └── inference_utils.py      # collaborative-inference helpers
```



## Requirements

```
python 3.9
torch==1.9.0.post2
torchvision==0.10.0
xlrd==2.0.1
apscheduler
```

## Running the Project

### Single-task mode

+ **Typically used to evaluate the latency improvement of DNN inference: each task is handed to the client via a command.**
+ **The bandwidth is measured once before each inference run.**

Run on the cloud device (-i and -p are the ip and port the server exposes; -d selects cpu or gpu on the cloud: pass "cpu" or "cuda"):

```python
python cloud_api.py -i 127.0.0.1 -p 9999 -d cpu
```

Run on the edge device (-i and -p are the ip and port the server exposes; -d selects cpu or gpu on the edge: pass "cpu" or "cuda"):

```python
# -t selects the model type: "alex_net" "vgg_net" "easy_net" "inception" "inception_v2"
python edge_api.py -i 127.0.0.1 -p 9999 -d cpu -t easy_net
```

**Single-machine run results are shown below.**
**Cloud device:** python cloud_api.py -i 127.0.0.1 -p 9999 -d cpu

```
monitor server get bandwidth : 3192.4125617104364 MB/s
successfully connection :
get model type successfully.
short message , model latency on the cloud device. has been sent successfully
get partition point successfully.
get edge_output and transfer latency successfully.
short message , transfer latency has been sent successfully
easy_net finished inference on the cloud device - 0.001 ms
short message , cloud latency has been sent successfully
================= DNN Collaborative Inference Finished. ===================
```

**Edge device:** python edge_api.py -i 127.0.0.1 -p 9999 -d cpu -t easy_net

```
(tjyy) tianjiangyu@tianjiangyudeMacBook-Pro Neurosurgeon % python edge_api.py -i 127.0.0.1 -p 9999 -d cpu -t easy_net
get bandwidth value : 3192.4125617104364 MB/s
short message , model type has been sent successfully
start construct graph for model...
partition edges : []
short message , partition strategy has been sent successfully
easy_net finished inference on the edge device - 2.193 ms
get yes , edge output has been sent successfully
easy_net transfer finished - 0.232 ms
easy_net finished inference on the cloud device - 0.001 ms
================= DNN Collaborative Inference Finished. ===================
```



## Paper Details

#### A. The Impact of DNN Inference Workloads

![image-20230709092724650](./assets/image-20230709092724650.png)

**(1) Under light workload, each stage finishes the current frame before the next frame arrives, as the lower subplot of Fig. 8 shows.** Mathematically:

$$
\max\{t_c, t_t, t_e\} < 1/Q
$$

The corresponding Gantt chart is shown in Fig. 8. In this case we only need to finish each frame as fast as possible, i.e. minimize

$$
T_e + T_c + T_t
$$

**(2) When the system load is heavy, minimizing $T_e + T_t + T_c$ may congest the system:**

$$
\max\{t_c, t_t, t_e\} \ge 1/Q
$$

For example, in the upper subplot of Fig. 8, the edge inference latency is so long that the next frame arrives before the current frame leaves the edge. In this case **we need to maximize the system throughput**, i.e. minimize

$$
\max\{t_c, t_t, t_e\}
$$

which maximizes the number of tasks the system can process per unit time.



#### B. The Light Workload Partitioning Algorithm

![image-20230709094554166](./assets/image-20230709094554166.png)

**A new graph G' is constructed so that every edge captures exactly one latency value, which turns the ECDI-L problem into a minimum weighted s-t cut problem on G'.**

First, how G' is constructed from G:

- Cloud computing delay: red links capture the cloud computation latency
- Edge computing delay: blue links capture the edge computation latency of v
- Communication delay: all other links correspond to communication latency; since a node such as v1 may have several successors, an auxiliary node v1' is introduced

The red, blue and black costs correspond to cloud computation, edge computation and communication latency respectively. Dashed links are assigned infinite capacity.

**ECDI-L is equivalent to the minimum e-c cut of G'.**

- if the link from e to vi is cut (the red links in Fig. 9(b)), vi is processed on the cloud
- if the link from vj to c is cut (the blue links in Fig. 9(b)), vj is processed on the edge
- if the link from vi to vj is cut (the black links in Fig. 9(b)), the data of vi is transferred to the cloud

The total cut cost of the red links equals the cloud computation time Tc; of the blue links, the edge computation time Te; of the black links, the transfer time Tt excluding network delay. If the e-c cut of G' is minimal, the inference latency of a single frame is minimal.

![image-20230709095005389](./assets/image-20230709095005389.png)
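To make the construction concrete, here is a tiny sketch (not from the repository) that builds such an e-c graph for a three-layer chain with networkx and reads the partition off the minimum cut. The capacities are made-up latencies, and a chain is used so no auxiliary nodes are needed:

```python
import networkx as nx

# toy chain v1 -> v2 -> v3 with made-up latencies (ms)
edge_cost  = {1: 4.0, 2: 6.0, 3: 8.0}   # blue links: vi -> cloud, cut => run vi on the edge
cloud_cost = {1: 1.0, 2: 1.5, 3: 2.0}   # red links: edge -> vi, cut => run vi on the cloud
trans_cost = {1: 5.0, 2: 2.0}           # black links: vi -> v(i+1), cut => transfer after vi

G = nx.DiGraph()
for v in (1, 2, 3):
    G.add_edge("edge", v, capacity=cloud_cost[v])
    G.add_edge(v, "cloud", capacity=edge_cost[v])
for v in (1, 2):
    G.add_edge(v, v + 1, capacity=trans_cost[v])

cut_value, (reachable, non_reachable) = nx.minimum_cut(G, "edge", "cloud")
# vertices on the "edge" side run locally, the rest run on the cloud
print(cut_value, reachable - {"edge"}, non_reachable - {"cloud"})
```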
#### C. The Heavy Workload Partitioning Algorithms

![image-20230709095546622](./assets/image-20230709095546622.png)



#### D. The Dynamic Partitioning Algorithm

![image-20230709091256841](./assets/image-20230709091256841.png)

The network state keeps changing, which dynamically affects both the workload-mode choice and the partition decision; DADS is designed to adapt to these network dynamics. As Algorithm 3 shows:

- monitor-task() monitors whether the video is active (line 2). This can be implemented with the "iperf" tool; see Section IV for the concrete implementation.
- the real-time network bandwidth is obtained by monitor-net() (line 3)
- DSL() is called to compute the partition strategy (line 4).
- At this point, if the system is in light-workload mode, the partition produced by DSL is accepted; otherwise the system is in heavy-workload mode and DSH() is called to adjust the partition so as to minimize the maximum stage delay (line 6). If the completion rate is still lower than the sampling rate, the sampling rate is too high and even DSH() cannot keep up; the system would congest, so the user is asked to lower the sampling rate (lines 7-8).

## Contact

If you have better ideas about this project or would like to discuss it, please open a GitHub Issue.
--------------------------------------------------------------------------------
/net/net_utils.py:
--------------------------------------------------------------------------------
import socket
import time
import pickle
import torch
import platform
import speedtest as spt

def get_socket_server(ip, port, max_client_num=10):
    """
    Create a socket for the server (the cloud device) to wait for client connections.
    :param ip: ip of the cloud machine
    :param port: network port of the socket
    :param max_client_num: maximum number of clients
    :return: the created socket
    """
    socket_server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)  # create the socket

    # check which platform we are running on
    sys_platform = platform.platform().lower()
    if "windows" in sys_platform:
        socket_server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)  # windows
    else:
        socket_server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)  # macos or linux

    socket_server.bind((ip, port))  # bind the port
    socket_server.listen(max_client_num)  # start listening
    return socket_server


def get_socket_client(ip, port):
    """
    Create a socket for the client (the edge device) to connect to the cloud device.
    :param ip: ip of the cloud machine to connect to
    :param port: port of the cloud socket
    :return: the established connection
    """
    conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    conn.connect((ip, port))
    return conn


def close_conn(conn):
    """
    Close a conn connection on the edge device.
    :param conn: the conn connection
    :return: None
    """
    conn.close()



def close_socket(p):
    """
    Close a socket on the cloud device.
    :param p: the socket
    :return: None
    """
    p.close()


def wait_client(p):
    """
    Wait for one conn connection.
    :param p: the socket
    :return:
    """
    conn, client = p.accept()
    print(f"successfully connection :{conn}")
    return conn, client


def send_data(conn, x, msg="msg", show=True):
    """
    Send a long payload to the other side, e.g. a tensor produced by an intermediate DNN layer.
    Note: the receiver must use the get_data function.
    The message sequence is: send length - receive ack - send payload - receive ack.
    :param conn: the client conn connection
    :param x: the data to send
    :param msg: a descriptive label
    :param show: whether to print the communication messages
    :return:
    """
    send_x = pickle.dumps(x)
    conn.sendall(pickle.dumps(len(send_x)))
    resp_len = conn.recv(1024).decode()

    conn.sendall(send_x)
    resp_data = conn.recv(1024).decode()
    if show:
        print(f"get {resp_data} , {msg} has been sent successfully")  # the other side has received the data



def send_short_data(conn, x, msg="msg", show=True):
    """ Send a short payload to the other side; the receiver uses get_short_data directly. """
    send_x = pickle.dumps(x)
    conn.sendall(send_x)
    if show:
        print(f"short message , {msg} has been sent successfully")  # the other side has received the data



def get_data(conn):
    """
    Receive one long payload: receive length - ack - receive payload - ack.
    :param conn: an established connection
    :return: the parsed data and the latency of receiving it
    """
    # receive the payload length
    data_len = pickle.loads(conn.recv(1024))
    conn.sendall("yes len".encode())

    # receive the payload and record the latency
    sum_time = 0.0
    data = [conn.recv(1)]
    while True:
        start_time = time.perf_counter()
        packet = conn.recv(40960)
        end_time = time.perf_counter()
        transport_time = (end_time - start_time) * 1000  # convert to ms
        sum_time += transport_time

        data.append(packet)
        if len(b"".join(data)) >= data_len:
            break
        # if len(packet) < 4096: break

    parse_data = pickle.loads(b"".join(data))
    conn.sendall("yes".encode())
    return parse_data, sum_time


def get_short_data(conn):
    """ Receive a short payload. """
    return pickle.loads(conn.recv(1024))


def get_bandwidth():
    """
    Measure the current network bandwidth.
    :return: network bandwidth in MB/s
    """
    print("measuring network bandwidth, wait...")
    spd = spt.Speedtest(secure=True)
    spd.get_best_server()

    # download = int(spd.download() / 1024 / 1024)
    upload = int(spd.upload() / 1024 / 1024)

    # print(f'current download speed : {str(download)} MB/s')
    print(f'current upload speed : {str(upload)} MB/s')
    return upload


def get_speed(network_type, bandwidth):
    """
    Convert the bandwidth according to the network type.
    :param network_type: 3g, lte or wifi
    :param bandwidth: the corresponding speed, KB/s for 3g, MB/s for lte and wifi
    :return: bandwidth in Bpms (bytes per millisecond)
    """
    transfer_from_MB_to_B = 1024 * 1024
    transfer_from_KB_to_B = 1024

    if network_type == "3g":
        return bandwidth * transfer_from_KB_to_B / 1000
    elif network_type == "lte" or network_type == "wifi":
        return bandwidth * transfer_from_MB_to_B / 1000
    else:
        raise RuntimeError(f"network type not supported yet - {network_type}")


def create_server(p):
    """
    Build a server with a socket and loop waiting for client requests.
    Generally used only for testing.
    :param p: the socket connection
    :return: None
    """
    while True:
        conn, client = p.accept()  # a client request arrives
        print(f"connect with client :{conn} successfully ")

        sum_time = 0.0
        # exchange messages
        data = [conn.recv(1)]  # receive a 1-byte message first, then start timing, for more accurate measurements
        while True:
            start_time = time.perf_counter()  # record the start time
            packet = conn.recv(1024)
            end_time = time.perf_counter()  # record the end time
            transport_time = (end_time - start_time) * 1000
            sum_time += transport_time  # accumulate the transfer time into sum_time

            data.append(packet)
            if len(packet) < 1024:  # length < 1024 means all data has been received
                break

        parse_data = pickle.loads(b"".join(data))  # pickle is used on both ends, so unpickle here
        print(f"get all data come from :{conn} successfully ")

        if torch.is_tensor(parse_data):  # estimate the data size, mainly for tensors
            total_num = 1
            for num in parse_data.shape:
                total_num += num
            data_size = total_num * 4
        else:
            data_size = 0.0

        print(f"data size(bytes) : {data_size} \t transfer time : {sum_time:.3} ms")
        print("=====================================")
        conn.send("yes".encode("UTF-8"))  # reply to the client after receiving everything
        conn.close()


def show_speed(data_size, actual_latency, speed_Bpms):
    """
    Compare:
    (1) the real bandwidth measured by iperf vs the predicted bandwidth
    (2) the real transfer latency vs the latency predicted by the formula
    Generally used only for testing.
    :param data_size: data size in bytes
    :param actual_latency: the measured transfer latency
    :param speed_Bpms: the real bandwidth measured by iperf
    :return: prints the comparison; the numbers should be close
    """
    print(f"actual speed : {speed_Bpms:.3f} B/ms")  # bandwidth measured by iperf
    print(f"predicted speed : {(data_size/actual_latency):.3f} B/ms")  # bandwidth derived from data size and measured time

    print(f"actual latency for {data_size} bytes : {actual_latency:.3f} ms")  # the measured latency
    print(f"predicted latency for {data_size} bytes : {(data_size / speed_Bpms):.3f} ms")  # latency predicted from the iperf bandwidth
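A minimal round-trip sketch for the send/receive protocol above (length, ack, payload, ack), run as two processes on localhost; the port number and the one-second startup wait are arbitrary choices, not part of the repository:

```python
import time
import torch
from multiprocessing import Process
from net import net_utils

def server():
    s = net_utils.get_socket_server("127.0.0.1", 9998)
    conn, _ = net_utils.wait_client(s)
    data, latency_ms = net_utils.get_data(conn)  # acks the length and the payload
    print(f"received {tuple(data.shape)} in {latency_ms:.3f} ms")
    net_utils.close_conn(conn)
    net_utils.close_socket(s)

if __name__ == '__main__':
    p = Process(target=server)
    p.start()
    time.sleep(1)  # give the server a moment to start listening
    conn = net_utils.get_socket_client("127.0.0.1", 9998)
    net_utils.send_data(conn, torch.rand((1, 3, 224, 224)), "demo tensor")
    net_utils.close_conn(conn)
    p.join()
```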
--------------------------------------------------------------------------------
/dads_framework/dinic.py:
--------------------------------------------------------------------------------
import networkx as nx
import sys
from collections import deque
from decimal import Decimal

def create_residual_network(origin_digraph):
    """
    Build the initial residual network from the original directed graph.
    The initial residual network is simply a copy of the original digraph.
    :param origin_digraph: the constructed directed graph
    :return: the initial residual graph residual_graph
    """
    return origin_digraph.copy()



def bfs_for_level_digraph(residual_digraph):
    """
    Build the level digraph from the residual digraph using BFS.
    :param residual_digraph: the residual network
    :return: the level information level_dict,
             and whether the sink node is present in the dict (boolean) cloud_node_in_dict,
             used as the termination condition of the dinic algorithm
    """
    level_dict = {}  # records whether a node has been visited, and its level
    start_node = 'edge'
    level_dict[start_node] = 1

    # initialize a queue for the BFS traversal
    Q = deque()
    Q.append(start_node)

    # BFS traversal -> build the level digraph
    while True:
        if len(Q) == 0:
            break

        node = Q.popleft()  # pop a node of the previous level

        now_level = level_dict[node]
        for neighbor_nodes in nx.neighbors(residual_digraph, node):
            # skip neighbors that are already queued or already visited
            if (neighbor_nodes not in level_dict.keys()) and (neighbor_nodes not in Q) \
                    and residual_digraph.get_edge_data(node, neighbor_nodes)["capacity"] > 0:
                level_dict[neighbor_nodes] = now_level + 1
                Q.append(neighbor_nodes)


    # check whether the end node t appears in the level digraph
    end_node = 'cloud'
    cloud_node_in_dict = end_node in level_dict.keys()
    return level_dict, cloud_node_in_dict



def dfs_once(residual_graph, level_dict, dfs_start_node, augment_value):
    """
    Pick augmenting paths via DFS; one DFS pass can perform several augmentations and
    keeps modifying the weights of residual_graph along the way.
    One DFS pass over the level network completes one augmentation phase.
    :param residual_graph: the residual network
    :param level_dict: the level network information
    :param dfs_start_node: DFS start node
    :param augment_value: the augmentation value of this pass
    :return: the value of the augmenting path
    """
    tmp = augment_value
    end_node = "cloud"

    # handle the special case first
    if dfs_start_node == end_node:
        return augment_value

    for node in residual_graph.nodes():  # iterate over all vertices of the graph
        # find a node of the next level; .get is used because nodes BFS did not reach have no level
        if level_dict[dfs_start_node] + 1 == level_dict.get(node):
            if residual_graph.has_edge(dfs_start_node, node) and residual_graph.get_edge_data(dfs_start_node, node)["capacity"] > 0:  # capacity = 0 means this path is exhausted
                capacity = residual_graph.get_edge_data(dfs_start_node, node)["capacity"]
                # run DFS to find one augmenting path and record its value (bottleneck effect - take the minimum)
                flow_value = dfs_once(residual_graph, level_dict, node, min(tmp, capacity))

                # add a backward edge, or update the value of the existing one
                if flow_value > 0:
                    if not residual_graph.has_edge(node, dfs_start_node):
                        residual_graph.add_edge(node, dfs_start_node, capacity=flow_value)
                    else:
                        neg_flow_value = residual_graph.get_edge_data(node, dfs_start_node)["capacity"]
                        residual_graph.add_edge(node, dfs_start_node, capacity=flow_value + neg_flow_value)

                # update the forward edge
                residual_graph.add_edge(dfs_start_node, node, capacity=capacity - flow_value)
                # once the edge weight drops to 0 the edge can be removed,
                # so the level digraph is not built incorrectly
                if capacity - flow_value <= 0:
                    residual_graph.remove_edge(dfs_start_node, node)

                tmp -= flow_value
    return augment_value - tmp


def dinic_algorithm(origin_digraph):
    """
    Run the dinic algorithm on the directed graph to obtain the max-flow / min-cut solution.
    :param origin_digraph: the constructed directed graph
    :return: min_cut_value, reachable, non_reachable
    """
    min_cut_value = 0
    inf = sys.maxsize

    # create the initial residual digraph from the original graph
    residual_graph = create_residual_network(origin_digraph)
    # print(residual_graph.edges(data=True))

    for edge in residual_graph.edges(data=True):
        u = edge[0]
        v = edge[1]
        c = Decimal(str(edge[2]['capacity'])).quantize(Decimal('0.000'))
        residual_graph.add_edge(u, v, capacity=c)

    # build the level dict via BFS; this can be seen as building the level graph
    level_dict, cloud_node_in_dict = bfs_for_level_digraph(residual_graph)
    while cloud_node_in_dict:
        # run one DFS pass first
        dfs_value = dfs_once(residual_graph, level_dict, dfs_start_node="edge", augment_value=inf)
        min_cut_value += dfs_value
        while dfs_value > 0:  # dfs_value > 0 means more augmenting paths may still exist
            dfs_value = dfs_once(residual_graph, level_dict, dfs_start_node="edge", augment_value=inf)
            min_cut_value += dfs_value

        # once this phase of DFS is done, rebuild the BFS level digraph and loop
        # until the sink no longer appears in the level digraph
        level_dict, cloud_node_in_dict = bfs_for_level_digraph(residual_graph)

    # based on the final residual_graph (level_dict): nodes reachable from edge belong
    # to reachable, all other vertices to non_reachable
    reachable, non_reachable = set(), set()
    for node in residual_graph:
        if node in level_dict.keys(): reachable.add(node)
        else: non_reachable.add(node)

    return min_cut_value, reachable, non_reachable


def get_min_cut_set(graph, min_cut_value, reachable, non_reachable):
    """
    Get the min-cut set, i.e. at which edges the graph is cut,
    from the min_cut_value, reachable and non_reachable parameters.
    :param graph: the constructed directed graph
    :param min_cut_value: min-cut value, used in a check to verify the partition
    :param reachable: vertices reachable after the partition
    :param non_reachable: vertices not reachable after the partition
    :return: graph_partition_edge - the partition points inside the DNN model
             (i.e. excluding edges incident to the edge and cloud vertices)
    """
    start = 'edge'
    end = 'cloud'

    # cut_set = []
    cut_set_sum = 0.000
    graph_partition_edge = []
    for u, nbrs in ((n, graph[n]) for n in reachable):
        for v in nbrs:
            if v in non_reachable:
                if u != start and v != end:
                    graph_partition_edge.append((u, v))
                # cut_set.append((u, v))
                cut_set_sum += graph.edges[u, v]["capacity"]

    # min-cut value obtained from the cut set
    cut_set_sum = "{:.3f}".format(round(cut_set_sum, 3))
    min_cut_value = "{:.3f}".format(round(min_cut_value, 3))  # min-cut value obtained from the dinic algorithm

    # both must match for the partition to be correct
    if cut_set_sum != min_cut_value:
        raise RuntimeError("the strategy selected by the dinic algorithm is flawed - please check")
    return graph_partition_edge
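A quick property test for the implementation above, not part of the repository: build a small DAG with 'edge'/'cloud' terminals and 'capacity' attributes (the shape of graph produced by graph_construct is assumed) and compare dinic_algorithm's cut value against networkx's built-in minimum cut:

```python
import random
import networkx as nx
from dads_framework.dinic import dinic_algorithm

random.seed(0)
G = nx.DiGraph()
for v in range(4):
    # capacities rounded to 3 decimals, matching dinic's Decimal quantization
    G.add_edge("edge", v, capacity=round(random.uniform(0.1, 5.0), 3))
    G.add_edge(v, "cloud", capacity=round(random.uniform(0.1, 5.0), 3))
for u, v in [(0, 1), (1, 2), (2, 3), (0, 3)]:
    G.add_edge(u, v, capacity=round(random.uniform(0.1, 5.0), 3))

min_cut_value, reachable, non_reachable = dinic_algorithm(G)
ref_value, _ = nx.minimum_cut(G, "edge", "cloud")
assert abs(float(min_cut_value) - ref_value) < 1e-6, (min_cut_value, ref_value)
print(min_cut_value, reachable, non_reachable)
```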
--------------------------------------------------------------------------------
/models/EasyModel.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch import Tensor
from typing import Optional, List, Callable, Any
from collections import abc


def getBlockIndex(item, accumulate_len):
    """
    Given the index item, return which block the corresponding layer lives in.
    :param item: item or index of the layer, counted from 0
    :param accumulate_len: list of accumulated block lengths
    :return: the block index part_index; part_index = 0 means preInference,
             part_index = 1 means branch1, and so on
    """
    for part_index in range(len(accumulate_len)):
        part_len = accumulate_len[part_index]
        # find which block the layer belongs to
        if item < part_len:
            return part_index
    return len(accumulate_len)


class Operation_Concat(nn.Module):
    """
    Operation_Concat performs the final concatenation.
    """
    def __init__(self):
        super().__init__()
        self.res = 0
    def forward(self, outputs):
        self.res = torch.cat(outputs, 1)
        return self.res


class EasyModel(nn.Module):
    """
    A small Inception-like block: a model with a DAG topology.
    """
    def __init__(self, in_channels: int = 3) -> None:
        super(EasyModel, self).__init__()
        self.preInference = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=3, kernel_size=(7, 7), stride=(2, 2))
        )

        self.branch1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=8, kernel_size=(3, 3), padding=1)
        )

        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=8, kernel_size=(3, 3), padding=1)
        )
        self.concat = Operation_Concat()

        self.branch_list = [self.preInference, self.branch1, self.branch2]
        self.accumulate_len = []
        for i in range(len(self.branch_list)):
            if i == 0:
                self.accumulate_len.append(len(self.branch_list[i]))
            else:
                self.accumulate_len.append(self.accumulate_len[i - 1] + len(self.branch_list[i]))


        # a model with a DAG topology has to specify the following attributes by hand
        self.has_dag_topology = True
        self.record_output_list = [self.accumulate_len[0], self.accumulate_len[1], self.accumulate_len[2]]  # which layers must save their output
        self.dag_dict = {  # the inputs of the DAG-related layers
            self.accumulate_len[0] + 1: self.accumulate_len[0],
            self.accumulate_len[1] + 1: self.accumulate_len[0],
            self.accumulate_len[2] + 1: [self.accumulate_len[1], self.accumulate_len[2], ],
        }

    def _forward(self, x: Tensor) -> List[Tensor]:
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        outputs = [branch1, branch2]
        return outputs

    def forward(self, x: Tensor) -> Tensor:
        x = self.preInference(x)
        outputs = self._forward(x)
        return self.concat(outputs)

    def __len__(self):
        return self.accumulate_len[-1] + 1

    def __getitem__(self, item):
        # stop iterating once the index is out of range
        if item >= self.accumulate_len[-1] + 1:
            raise StopIteration()

        # pick the right DNN layer for the given item
        part_index = getBlockIndex(item, self.accumulate_len)
        if part_index == 0:
            layer = self.branch_list[part_index][item]
        elif part_index < len(self.accumulate_len):
            layer = self.branch_list[part_index][item - self.accumulate_len[part_index - 1]]
        else:
            layer = self.concat
        return layer

    def __iter__(self):
        return Inception_SentenceIterator(self.branch_list, self.concat, self.accumulate_len)
class Inception_SentenceIterator(abc.Iterator):
    def __init__(self, branch_list, concat, accumulate_len):
        self.branch_list = branch_list
        self.accumulate_len = accumulate_len
        self.concat = concat

        self._index = 0


    def __next__(self):
        # stop iterating once the index is out of range
        if self._index >= self.accumulate_len[-1] + 1:
            raise StopIteration()

        # pick the right DNN layer for the given index
        part_index = getBlockIndex(self._index, self.accumulate_len)
        if part_index == 0:
            layer = self.branch_list[part_index][self._index]
        elif part_index < len(self.accumulate_len):
            layer = self.branch_list[part_index][self._index - self.accumulate_len[part_index - 1]]
        else:
            layer = self.concat

        self._index += 1
        return layer


class easy_dag_part(nn.Module):
    def __init__(self, branches):
        super(easy_dag_part, self).__init__()
        self.branch1 = branches[0]
        self.branch2 = branches[1]
        self.concat = Operation_Concat()
    def forward(self, x):
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        outputs = [branch1, branch2]
        return self.concat(outputs)


class EdgeInception(nn.Module):
    """
    EdgeInception builds the edge part of a partitioned Inception block.
    """
    def __init__(self, edge_branches):
        super(EdgeInception, self).__init__()
        self.branch1 = edge_branches[0]
        self.branch2 = edge_branches[1]
    def forward(self, x):
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        outputs = [branch1, branch2]
        return outputs


class CloudInception(nn.Module):
    """
    CloudInception builds the cloud part of a partitioned Inception block.
    """
    def __init__(self, cloud_branches):
        super(CloudInception, self).__init__()
        self.branch1 = cloud_branches[0]
        self.branch2 = cloud_branches[1]
        self.concat = Operation_Concat()

    def forward(self, x):
        branch1 = self.branch1(x[0])
        branch2 = self.branch2(x[1])
        outputs = [branch1, branch2]
        return self.concat(outputs)
125 |             layer = self.branch_list[part_index][self._index - self.accumulate_len[part_index - 1]]
126 |         else:
127 |             layer = self.concat
128 |
129 |         self._index += 1
130 |         return layer
131 |
132 |
133 | class easy_dag_part(nn.Module):
134 |     def __init__(self, branches):
135 |         super(easy_dag_part, self).__init__()
136 |         self.branch1 = branches[0]
137 |         self.branch2 = branches[1]
138 |         self.concat = Operation_Concat()
139 |     def forward(self, x):
140 |         branch1 = self.branch1(x)
141 |         branch2 = self.branch2(x)
142 |         outputs = [branch1, branch2]
143 |         return self.concat(outputs)
144 |
145 |
146 | class EdgeInception(nn.Module):
147 |     """
148 |     EdgeInception builds the edge-side part of a partitioned Inception
149 |     """
150 |     def __init__(self, edge_branches):
151 |         super(EdgeInception, self).__init__()
152 |         self.branch1 = edge_branches[0]
153 |         self.branch2 = edge_branches[1]
154 |     def forward(self, x):
155 |         branch1 = self.branch1(x)
156 |         branch2 = self.branch2(x)
157 |         outputs = [branch1, branch2]
158 |         return outputs
159 |
160 |
161 | class CloudInception(nn.Module):
162 |     """
163 |     CloudInception builds the cloud-side part of a partitioned Inception
164 |     """
165 |     def __init__(self, cloud_branches):
166 |         super(CloudInception, self).__init__()
167 |         self.branch1 = cloud_branches[0]
168 |         self.branch2 = cloud_branches[1]
169 |         self.concat = Operation_Concat()
170 |
171 |     def forward(self, x):
172 |         branch1 = self.branch1(x[0])
173 |         branch2 = self.branch2(x[1])
174 |         outputs = [branch1, branch2]
175 |         return self.concat(outputs)
176 |
177 |
178 | def construct_edge_cloud_inception_block(model: EasyModel, model_partition_edge: list):
179 |     """
180 |     Build the edge model and the cloud model of the Inception
181 |     :param model: an Inception block that needs to be partitioned
182 |     :param model_partition_edge: the partition points of the Inception (start_layer,end_layer)
183 |     :return: edge_Inception, cloud_Inception
184 |     """
185 |     accumulate_len = model.accumulate_len
186 |     edge_model, cloud_model = nn.Sequential(), nn.Sequential()
187 |     if len(model_partition_edge) == 1:  # only one partition point is needed
188 |         partition_point = model_partition_edge[0][0]
189 |         assert partition_point <= accumulate_len[0] + 1
190 |         idx = 1
191 |         for layer in model:
192 |             if idx > accumulate_len[0]: break
193 |             if idx <= partition_point:
194 |                 edge_model.add_module(f"{idx}-{layer.__class__.__name__}", layer)
195 |             else:
196 |                 cloud_model.add_module(f"{idx}-{layer.__class__.__name__}", layer)
197 |             idx += 1
198 |         layer = easy_dag_part(model.branch_list[1:])
199 |         cloud_model.add_module(f"{idx}-{layer.__class__.__name__}", layer)
200 |     else:  # the partition runs across the 2 branches
201 |         assert len(model_partition_edge) == 2
202 |         branches = model.branch_list[1:]
203 |         edge_model.add_module(f"1-preInference", model.preInference)
204 |
205 |         edge_branches = []
206 |         cloud_branches = []
207 |         for edge in model_partition_edge:
208 |             edge_branch = nn.Sequential()
209 |             cloud_branch = nn.Sequential()
210 |
211 |             block, tmp_point = None, None
212 |             if edge[0] in range(accumulate_len[0] + 1, accumulate_len[1] + 1) or edge[1] in range(accumulate_len[0] + 1, accumulate_len[1] + 1):
213 |                 block = branches[0]
214 |                 tmp_point = edge[0] - accumulate_len[0]
215 |             elif edge[0] in range(accumulate_len[1] + 1, accumulate_len[2] + 1) or edge[1] in range(accumulate_len[1] + 1, accumulate_len[2] + 1):
216 |                 block = branches[1]
217 |                 tmp_point = edge[0] - accumulate_len[1]
218 |
219 |             idx = 1
220 |             for layer in block:
221 |                 if idx <= tmp_point:
222 |                     edge_branch.add_module(f"{idx}-{layer.__class__.__name__}", layer)
223 |                 else:
224 |                     cloud_branch.add_module(f"{idx}-{layer.__class__.__name__}", layer)
225 |                 idx += 1
226 |
227 |             edge_branches.append(edge_branch)
228 |             cloud_branches.append(cloud_branch)
229 |
230 |         # use edge_branches and cloud_branches to build the EdgeInception and CloudInception modules
231 |         edge_Inception = EdgeInception(edge_branches)
232 |         cloud_Inception = CloudInception(cloud_branches)
233 |
234 |         edge_model.add_module(f"2-edge-inception", edge_Inception)
235 |         cloud_model.add_module(f"1-cloud-inception", cloud_Inception)
236 |     return edge_model, cloud_model
237 |
--------------------------------------------------------------------------------
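A quick sanity check of the partitioning above (illustrative, not repository code; the single partition point (1, 2) is hypothetical and would normally come from the DADS min-cut stage):

import torch
from models.EasyModel import EasyModel, construct_edge_cloud_inception_block

model = EasyModel(in_channels=3)
x = torch.rand((1, 3, 224, 224))
# cut right after the single preInference layer; both branches and the concat run in the cloud
edge_model, cloud_model = construct_edge_cloud_inception_block(model, [(1, 2)])
out = cloud_model(edge_model(x))
print(out.shape == model(x).shape)  # True - the split model matches the original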
/utils/inference_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import time
4 |
5 | from models.AlexNet import AlexNet
6 | from models.VggNet import VggNet
7 | from models.EasyModel import EasyModel
8 | from models.InceptionBlock import InceptionBlock
9 | from models.InceptionBlockV2 import InceptionBlockV2
10 |
11 | import models.InceptionBlock as Inception
12 | import models.InceptionBlockV2 as Inception_v2
13 | import models.EasyModel as Easynet
14 |
15 | from utils.excel_utils import *
16 |
17 |
18 | def get_dnn_model(arg: str):
19 |     """
20 |     Get a DNN model
21 |     :param arg: the model name
22 |     :return: the corresponding model
23 |     """
24 |     input_channels = 3
25 |     if arg == "alex_net":
26 |         return AlexNet(input_channels=input_channels)
27 |     elif arg == "vgg_net":
28 |         return VggNet(input_channels=input_channels)
29 |     elif arg == "easy_net":
30 |         return EasyModel(in_channels=input_channels)
31 |     elif arg == "inception":
32 |         return InceptionBlock(in_channels=input_channels)
33 |     elif arg == "inception_v2":
34 |         return InceptionBlockV2(in_channels=input_channels)
35 |     else:
36 |         raise RuntimeError("There is no DNN model with this name")
37 |
38 |
39 | def show_model_constructor(model, skip=True):
40 |     """
41 |     Show the structure of each DNN layer
42 |     :param model: the DNN model
43 |     :param skip: whether to skip layers such as ReLU, BatchNorm and Dropout
44 |     :return: prints the structure of each DNN layer
45 |     """
46 |     print("show model constructor as follows: ")
47 |     if len(model) > 0:
48 |         idx = 1
49 |         for layer in model:
50 |             if skip is True:
51 |                 if isinstance(layer, nn.ReLU) or isinstance(layer, nn.BatchNorm2d) or isinstance(layer, nn.Dropout):
52 |                     continue
53 |             print(f'{idx}-{layer}')
54 |             idx += 1
55 |     else:
56 |         print("this model is an empty model")
57 |
58 |
59 |
60 | def show_features(model, input_data, device, epoch_cpu=50, epoch_gpu=100, skip=True, save=False, sheet_name="model", path=None):
61 |     """
62 |     Print the properties of each DNN layer and optionally save them into an excel sheet; the main columns are:
63 |     ["index", "layerName", "computation_time(ms)", "output_shape", "transport_num", "transport_size(MB)","accumulate_time(ms)"]
64 |     [layer index, layer name, layer computation latency, layer output shape, number of floats to transmit, transmission size, accumulated inference latency starting from layer 1]
65 |     :param model: the DNN model
66 |     :param input_data: the input data
67 |     :param device: the device to run on
68 |     :param epoch_cpu: number of inference loops on the cpu
69 |     :param epoch_gpu: number of inference loops on the gpu
70 |     :param skip: whether to skip unimportant DNN layers
71 |     :param save: whether to save the results into an excel sheet
72 |     :param sheet_name: the sheet name in the excel file
73 |     :param path: the excel file path
74 |     :return: the output of the last executed layer
75 |     """
76 |     if device == "cuda":
77 |         if not torch.cuda.is_available():
78 |             raise RuntimeError("CUDA is not available on this device, please set the device parameter to cpu")
79 |
80 |     # warm the device up before inference
81 |     warmUp(model, input_data, device)
82 |
83 |     if save:
84 |         # use the sheet_name passed in by the caller
85 |         value = [["index", "layerName", "computation_time(ms)", "output_shape", "transport_num",
86 |                   "transport_size(MB)", "accumulate_time(ms)"]]
87 |         create_excel_xsl(path, sheet_name, value)
88 |
89 |
90 |     if len(model) > 0:
91 |         idx = 1
92 |         accumulate_time = 0.0
93 |         for layer in model:
94 |             if skip is True:
95 |                 if isinstance(layer, nn.ReLU) or isinstance(layer, nn.BatchNorm2d) or isinstance(layer, nn.Dropout):
96 |                     continue
97 |
98 |             temp_x = input_data
99 |             # record the inference time of a single DNN layer
100 |             input_data, layer_time = recordTime(layer, temp_x, device, epoch_cpu, epoch_gpu)
101 |             accumulate_time += layer_time
102 |
103 |             # compute the size of the intermediate transmission in MB
104 |             total_num = 1
105 |             for num in input_data.shape:
106 |                 total_num *= num
107 |             size = total_num * 4 / 1000 / 1000
108 |
109 |             print("------------------------------------------------------------------")
110 |             print(f'{idx}-{layer} \n'
111 |                   f'computation time: {layer_time :.3f} ms\n'
112 |                   f'output shape: {input_data.shape}\t transport_num:{total_num}\t transport_size:{size:.3f}MB\t accumulate time:{accumulate_time:.3f}ms\n')
113 |
114 |             # save into the excel sheet
115 |             if save:
116 |                 # keep the caller-provided sheet_name so the rows are appended to the right sheet
117 |                 value = [[idx, f"{layer}", round(layer_time, 3), f"{input_data.shape}", total_num, round(size, 3),
118 |                           round(accumulate_time, 3)]]
119 |                 write_excel_xls_append(path, sheet_name, value)
120 |             idx += 1
121 |         return input_data
122 |     else:
123 |         print("this model is an empty model")
124 |         return input_data
125 |
126 |
127 |
128 | def warmUp(model, input_data, device):
129 |     """
130 |     Warm-up: if the device is not warmed up first, the collected latency data will be biased
131 |     :param model: the DNN model
132 |     :param input_data: the input data
133 |     :param device: the device type to run on
134 |     :return: None
135 |     """
136 |     epoch = 10
137 |     model = model.to(device)
138 |     for i in range(1):
139 |         if device == "cuda":
140 |             warmUpGpu(model, input_data, device, epoch)
141 |         elif device == "cpu":
142 |             warmUpCpu(model, input_data, device, epoch)
143 |
144 |
145 | def warmUpGpu(model, input_data, device, epoch):
146 |     """ GPU device warm-up """
147 |     dummy_input = torch.rand(input_data.shape).to(device)
148 |     with torch.no_grad():
149 |         for i in range(10):
150 |             _ = model(dummy_input)
151 |
152 |         avg_time = 0.0
153 |         for i in range(epoch):
154 |             starter = torch.cuda.Event(enable_timing=True)
155 |             ender = torch.cuda.Event(enable_timing=True)
156 |             starter.record()
157 |
158 |             _ = model(dummy_input)
159 |
160 |             ender.record()
161 |             torch.cuda.synchronize()
162 |             curr_time = starter.elapsed_time(ender)
163 |             avg_time += curr_time
164 |         avg_time /= epoch
165 |         # print(f"GPU Warm Up : {curr_time:.3f}ms")
166 |         # print("==============================================")
167 |
168 |
169 | def warmUpCpu(model, input_data, device, epoch):
170 |     """ CPU device warm-up """
171 |     dummy_input = torch.rand(input_data.shape).to(device)
172 |     with torch.no_grad():
173 |         for i in range(10):
174 |             _ = model(dummy_input)
175 |
176 |         avg_time = 0.0
177 |         for i in range(epoch):
178 |             start = time.perf_counter()
179 |             _ = model(dummy_input)
180 |             end = time.perf_counter()
181 |             curr_time = end - start
182 |             avg_time += curr_time
183 |         avg_time /= epoch
184 |         # print(f"CPU Warm Up : {curr_time * 1000:.3f}ms")
185 |         # print("==============================================")
186 |
187 |
188 |
189 | def recordTime(model, input_data, device, epoch_cpu, epoch_gpu):
190 |     """
191 |     Record the inference time of a DNN model or a DNN layer, dispatching to the proper function for the device
192 |     :param model: the DNN model
193 |     :param input_data: the input data
194 |     :param device: the device to run on
195 |     :param epoch_cpu: number of inference loops on the cpu
196 |     :param epoch_gpu: number of inference loops on the gpu
197 |     :return: the output result and the inference latency
198 |     """
199 |     model = model.to(device)
200 |     res_x, computation_time = None, None
201 |     if device == "cuda":
202 |         res_x, computation_time = recordTimeGpu(model, input_data, device, epoch_gpu)
203 |     elif device == "cpu":
204 |         res_x, computation_time = recordTimeCpu(model, input_data, device, epoch_cpu)
205 |     return res_x, computation_time
206 |
207 |
208 |
209 | def recordTimeGpu(model, input_data, device, epoch):
210 |     all_time = 0.0
211 |     with torch.no_grad():
212 |         for i in range(epoch):
213 |             if torch.is_tensor(input_data):
214 |                 input_data = torch.rand(input_data.shape).to(device)
215 |             # init loggers
216 |             starter = torch.cuda.Event(enable_timing=True)
217 |             ender = torch.cuda.Event(enable_timing=True)
218 |
219 |             with torch.no_grad():
220 |                 starter.record()
221 |                 res_x = model(input_data)
222 |                 ender.record()
223 |
224 |             # wait for GPU sync
225 |             # because of how GPU execution is scheduled, the following line is required to measure GPU inference latency accurately
226 |             torch.cuda.synchronize()
227 |             curr_time = starter.elapsed_time(ender)
228 |             all_time += curr_time
229 |         all_time /= epoch
230 |     return res_x, all_time
231 |
232 |
233 | def recordTimeCpu(model, input_data, device, epoch):
234 |     all_time = 0.0
235 |     for i in range(epoch):
236 |         if torch.is_tensor(input_data):
237 |             input_data = torch.rand(input_data.shape).to(device)
238 |
239 |         with torch.no_grad():
240 |             start_time = time.perf_counter()
241 |             res_x = model(input_data)
242 |             end_time = time.perf_counter()
243 |
244 |         curr_time = end_time - start_time
245 |         all_time += curr_time
246 |     all_time /= epoch
247 |     return res_x, all_time * 1000
248 |
249 |
250 | def model_partition(model, model_partition_edge):
251 |     """
252 |     Partition the DNN model according to model_partition_edge
253 |     :param model: the DNN model to partition
254 |     :param model_partition_edge: the model partition points
255 |     :return: the edge-side model edge_model and the cloud-side model cloud_model
256 |     """
257 |     # if model_partition_edge is [], the whole model is deployed on the edge
258 |     if len(model_partition_edge) == 0:
259 |         return model, nn.Sequential()
260 |
261 |     # start building the edge model and the cloud model
262 |     edge_model, cloud_model = nn.Sequential(), nn.Sequential()
263 |     if isinstance(model, Inception.InceptionBlock):
264 |         return Inception.construct_edge_cloud_inception_block(model, model_partition_edge)
265 |     if isinstance(model, Inception_v2.InceptionBlockV2):
266 |         return Inception_v2.construct_edge_cloud_inception_block(model, model_partition_edge)
267 |     if isinstance(model, Easynet.EasyModel):
268 |         return Easynet.construct_edge_cloud_inception_block(model, model_partition_edge)
269 |
270 |     if len(model_partition_edge) == 1:  # partition for a chain structure
271 |         partition_point = model_partition_edge[0][0]
272 |         idx = 1
273 |         for layer in model:
274 |             if idx <= partition_point:
275 |                 edge_model.add_module(f"{idx}-{layer.__class__.__name__}", layer)
276 |             else:
277 |                 cloud_model.add_module(f"{idx}-{layer.__class__.__name__}", layer)
278 |             idx += 1
279 |         return edge_model, cloud_model
280 |
281 |
282 |
--------------------------------------------------------------------------------
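A minimal sketch (not part of the repository) of the profiling and partitioning helpers above; the partition point (3, 4) is hypothetical, and AlexNet is assumed to be iterable layer-by-layer like the other models in this repo:

import torch
from utils.inference_utils import get_dnn_model, show_features, model_partition

model = get_dnn_model("alex_net")
x = torch.rand((1, 3, 224, 224))
# profile per-layer latency and transmission size on the CPU (no excel export)
show_features(model, x, device="cpu", epoch_cpu=10, save=False)
# split the chain model after layer 3; the edge runs layers 1-3, the cloud runs the rest
edge_model, cloud_model = model_partition(model, [(3, 4)])
out = cloud_model(edge_model(x))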
/models/InceptionBlock.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch import Tensor
4 | from typing import Optional, List, Callable, Any
5 | from collections import abc
6 |
7 |
8 | def getBlockIndex(item, accumulate_len):
9 |     """
10 |     Given an index item, determine which sub-module the layer represented by that item should be selected from
11 |     :param item: item or index, the layer index counted from 0
12 |     :param accumulate_len: a list of the accumulated lengths of each part
13 |     :return: the corresponding module index part_index; part_index = 0 stands for features, and so on (part_index = 1 stands for inception3)
14 |     """
15 |     for part_index in range(len(accumulate_len)):
16 |         part_len = accumulate_len[part_index]
17 |         # find which module the layer belongs to
18 |         if item < part_len:
19 |             return part_index
20 |     return len(accumulate_len)
21 |
22 |
23 | class BasicConv2d(torch.nn.Sequential):
24 |     """
25 |     A simple block structure: conv + bn
26 |     """
27 |     def __init__(self, in_channels: int, out_channels: int, **kwargs: Any) -> None:
28 |         conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
29 |         bn = nn.BatchNorm2d(out_channels, eps=0.001)
30 |         layers = [conv, bn]
31 |         super(BasicConv2d, self).__init__(*layers)
32 |
33 |
34 | class Operation_Concat(nn.Module):
35 |     """
36 |     Operation_Concat performs the final concatenation
37 |     """
38 |     def __init__(self):
39 |         super().__init__()
40 |         self.res = 0
41 |     def forward(self, outputs):
42 |         self.res = torch.cat(outputs, 1)
43 |         return self.res
44 |
45 |
46 | class InceptionBlock(nn.Module):
47 |     """
48 |     Builds an InceptionBlock structure: a model with a DAG topology
49 |     """
50 |     def __init__(self,
51 |                  in_channels: int = 3, ch1x1: int = 64, ch3x3red: int = 96,
52 |                  ch3x3: int = 128, ch5x5red: int = 16, ch5x5: int = 32,
53 |                  pool_proj: int = 32, conv_block: Optional[Callable[..., nn.Module]] = None,
54 |                  ) -> None:
55 |         super(InceptionBlock, self).__init__()
56 |         if conv_block is None:
57 |             conv_block = BasicConv2d
58 |
59 |         self.preInference = nn.Sequential(
60 |             conv_block(in_channels, 64, kernel_size=7, stride=2, padding=3),
61 |             nn.MaxPool2d(3, stride=2),
62 |             conv_block(64, 64, kernel_size=1),
63 |             conv_block(64, 192, kernel_size=3, padding=1),
64 |             nn.MaxPool2d(3, stride=2),
65 |         )
66 |         out_pre_channels = 192
67 |
68 |         self.branch1 = nn.Sequential(
69 |             conv_block(out_pre_channels, ch1x1, kernel_size=1)
70 |         )
71 |
72 |         self.branch2 = nn.Sequential(
73 |             conv_block(out_pre_channels, ch3x3red, kernel_size=1),
74 |             conv_block(ch3x3red, ch3x3, kernel_size=3, padding=1)
75 |         )
76 |
77 |         self.branch3 = nn.Sequential(
78 |             conv_block(out_pre_channels, ch5x5red, kernel_size=1),
79 |             conv_block(ch5x5red, ch5x5, kernel_size=5, padding=2),
80 |         )
81 |
82 |         self.branch4 = nn.Sequential(
83 |             nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
84 |             conv_block(out_pre_channels, pool_proj, kernel_size=1),
85 |         )
86 |         # concat implements the final concatenation
87 |         self.concat = Operation_Concat()
88 |         self.branch_list = [self.preInference, self.branch1, self.branch2, self.branch3, self.branch4]
89 |
90 |         # accumulated lengths of the inception parts, used for iterating inside the inception
91 |         self.accumulate_len = []
92 |         for i in range(len(self.branch_list)):
93 |             if i == 0:
94 |                 self.accumulate_len.append(len(self.branch_list[i]))
95 |             else:
96 |                 self.accumulate_len.append(self.accumulate_len[i-1] + len(self.branch_list[i]))
97 |
98 |         # for a DAG topology, the following settings must be provided by hand
99 |         self.has_dag_topology = True
100 |         self.record_output_list = [self.accumulate_len[0], self.accumulate_len[1], self.accumulate_len[2],
101 |                                    self.accumulate_len[3], self.accumulate_len[4]]  # which layers need to save their outputs
102 |         self.dag_dict = {  # defines the inputs of the DAG-related layers
103 |             self.accumulate_len[0] + 1: self.accumulate_len[0],
104 |             self.accumulate_len[1] + 1: self.accumulate_len[0],
105 |             self.accumulate_len[2] + 1: self.accumulate_len[0],
106 |             self.accumulate_len[3] + 1: self.accumulate_len[0],
107 |             self.accumulate_len[4] + 1: [self.accumulate_len[1], self.accumulate_len[2],
108 |                                          self.accumulate_len[3], self.accumulate_len[4],]
109 |         }
110 |
111 |
112 |     def _forward(self, x: Tensor) -> List[Tensor]:
113 |         branch1 = self.branch1(x)
114 |         branch2 = self.branch2(x)
115 |         branch3 = self.branch3(x)
116 |         branch4 = self.branch4(x)
117 |
118 |         outputs = [branch1, branch2, branch3, branch4]
119 |         return outputs
120 |
121 |     def forward(self, x: Tensor) -> Tensor:
122 |         x = self.preInference(x)
123 |         outputs = self._forward(x)
124 |         return self.concat(outputs)
125 |
126 |     def __len__(self):
127 |         return self.accumulate_len[-1] + 1
128 |
129 |     def __getitem__(self, item):
130 |         # stop the iteration if the index is out of range
131 |         if item >= self.accumulate_len[-1] + 1:
132 |             raise StopIteration()
133 |
134 |         # fetch the correct DNN layer according to the given item
135 |         part_index = getBlockIndex(item, self.accumulate_len)
136 |         if part_index == 0:
137 |             layer = self.branch_list[part_index][item]
138 |         elif part_index < len(self.accumulate_len):
139 |             layer = self.branch_list[part_index][item - self.accumulate_len[part_index - 1]]
140 |         else:
141 |             layer = self.concat
142 |         return layer
143 |
144 |     def __iter__(self):
145 |         return Inception_SentenceIterator(self.branch_list, self.concat, self.accumulate_len)
146 |
147 |
148 |
149 | class Inception_SentenceIterator(abc.Iterator):
150 |     def __init__(self, branch_list, concat, accumulate_len):
151 |         self.branch_list = branch_list
152 |         self.accumulate_len = accumulate_len
153 |         self.concat = concat
154 |
155 |         self._index = 0
156 |
157 |     def __next__(self):
158 |         # stop the iteration if the index is out of range
159 |         if self._index >= self.accumulate_len[-1] + 1:
160 |             raise StopIteration()
161 |
162 |         # fetch the correct DNN layer according to the current index
163 |         part_index = getBlockIndex(self._index, self.accumulate_len)
164 |         if part_index == 0:
165 |             layer = self.branch_list[part_index][self._index]
166 |         elif part_index < len(self.accumulate_len):
167 |             layer = self.branch_list[part_index][self._index - self.accumulate_len[part_index - 1]]
168 |         else:
169 |             layer = self.concat
170 |
171 |         self._index += 1
172 |         return layer
173 |
174 |
175 | class inception_dag_part(nn.Module):
176 |     """
177 |     Extracts the DAG part of the inception. Let p be the number of layers in self.preInference;
178 |     if the partition happens at layer p (inclusive) or later, the remaining part can directly use inception_dag_part
179 |     """
180 |     def __init__(self, branches):
181 |         super(inception_dag_part, self).__init__()
182 |         self.branch1 = branches[0]
183 |         self.branch2 = branches[1]
184 |         self.branch3 = branches[2]
185 |         self.branch4 = branches[3]
186 |         self.concat = Operation_Concat()
187 |     def forward(self, x):
188 |         branch1 = self.branch1(x)
189 |         branch2 = self.branch2(x)
190 |         branch3 = self.branch3(x)
191 |         branch4 = self.branch4(x)
192 |
193 |         outputs = [branch1, branch2, branch3, branch4]
194 |         return self.concat(outputs)
195 |
196 |
197 | class EdgeInception(nn.Module):
198 |     """
199 |     EdgeInception builds the edge-side part of a partitioned Inception
200 |     """
201 |     def __init__(self, edge_branches):
202 |         super(EdgeInception, self).__init__()
203 |         self.branch1 = edge_branches[0]
204 |         self.branch2 = edge_branches[1]
205 |         self.branch3 = edge_branches[2]
206 |         self.branch4 = edge_branches[3]
207 |     def forward(self, x):
208 |         branch1 = self.branch1(x)
209 |         branch2 = self.branch2(x)
210 |         branch3 = self.branch3(x)
211 |         branch4 = self.branch4(x)
212 |
213 |         outputs = [branch1, branch2, branch3, branch4]
214 |         return outputs
215 |
216 |
217 | class CloudInception(nn.Module):
218 |     """
219 |     CloudInception builds the cloud-side part of a partitioned Inception
220 |     """
221 |     def __init__(self, cloud_branches):
222 |         super(CloudInception, self).__init__()
223 |         self.branch1 = cloud_branches[0]
224 |         self.branch2 = cloud_branches[1]
225 |         self.branch3 = cloud_branches[2]
226 |         self.branch4 = cloud_branches[3]
227 |         self.concat = Operation_Concat()
228 |
229 |     def forward(self, x):
230 |         branch1 = self.branch1(x[0])
231 |         branch2 = self.branch2(x[1])
232 |         branch3 = self.branch3(x[2])
233 |         branch4 = self.branch4(x[3])
234 |
235 |         outputs = [branch1, branch2, branch3, branch4]
236 |         return self.concat(outputs)
237 |
238 |
239 | def construct_edge_cloud_inception_block(model: InceptionBlock, model_partition_edge: list):
240 |     """
241 |     Build the edge model and the cloud model of the Inception
242 |     :param model: an Inception block that needs to be partitioned
243 |     :param model_partition_edge: the partition points of the Inception (start_layer,end_layer)
244 |     :return: edge_Inception, cloud_Inception
245 |     """
246 |     accumulate_len = model.accumulate_len
247 |     edge_model, cloud_model = nn.Sequential(), nn.Sequential()
248 |     if len(model_partition_edge) == 1:  # only one partition point is needed
249 |         partition_point = model_partition_edge[0][0]
250 |         assert partition_point <= accumulate_len[0] + 1
251 |         idx = 1
252 |         for layer in model:
253 |             if idx > accumulate_len[0]: break
254 |             if idx <= partition_point:
255 |                 edge_model.add_module(f"{idx}-{layer.__class__.__name__}", layer)
256 |             else:
257 |                 cloud_model.add_module(f"{idx}-{layer.__class__.__name__}", layer)
258 |             idx += 1
259 |         layer = inception_dag_part(model.branch_list[1:])
260 |         cloud_model.add_module(f"{idx}-{layer.__class__.__name__}", layer)
261 |     else:  # the partition runs across the 4 branches
262 |         assert len(model_partition_edge) == 4
263 |         branches = model.branch_list[1:]
264 |         edge_model.add_module(f"1-preInference", model.preInference)
265 |
266 |         edge_branches = []
267 |         cloud_branches = []
268 |         for edge in model_partition_edge:
269 |             edge_branch = nn.Sequential()
270 |             cloud_branch = nn.Sequential()
271 |
272 |             block, tmp_point = None, None
273 |             if edge[0] in range(accumulate_len[0] + 1, accumulate_len[1] + 1) or edge[1] in range(accumulate_len[0] + 1, accumulate_len[1] + 1):
274 |                 block = branches[0]
275 |                 tmp_point = edge[0] - accumulate_len[0]
276 |             elif edge[0] in range(accumulate_len[1] + 1, accumulate_len[2] + 1) or edge[1] in range(accumulate_len[1] + 1, accumulate_len[2] + 1):
277 |                 block = branches[1]
278 |                 tmp_point = edge[0] - accumulate_len[1]
279 |             elif edge[0] in range(accumulate_len[2] + 1, accumulate_len[3] + 1) or edge[1] in range(accumulate_len[2] + 1, accumulate_len[3] + 1):
280 |                 block = branches[2]
281 |                 tmp_point = edge[0] - accumulate_len[2]
282 |             elif edge[0] in range(accumulate_len[3] + 1, accumulate_len[4] + 1) or edge[1] in range(accumulate_len[3] + 1, accumulate_len[4] + 1):
283 |                 block = branches[3]
284 |                 tmp_point = edge[0] - accumulate_len[3]
285 |
286 |             idx = 1
287 |             for layer in block:
288 |                 if idx <= tmp_point:
289 |                     edge_branch.add_module(f"{idx}-{layer.__class__.__name__}", layer)
290 |                 else:
291 |                     cloud_branch.add_module(f"{idx}-{layer.__class__.__name__}", layer)
292 |                 idx += 1
293 |
294 |             edge_branches.append(edge_branch)
295 |             cloud_branches.append(cloud_branch)
296 |
297 |         # use edge_branches and cloud_branches to build the EdgeInception and CloudInception modules
298 |         edge_Inception = EdgeInception(edge_branches)
299 |         cloud_Inception = CloudInception(cloud_branches)
300 |
301 |         edge_model.add_module(f"2-edge-inception", edge_Inception)
302 |         cloud_model.add_module(f"1-cloud-inception", cloud_Inception)
303 |     return edge_model, cloud_model
--------------------------------------------------------------------------------
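A sketch of a multi-branch partition (illustrative, not repository code; the four cut edges are hypothetical min-cut results, listed in branch order so the concatenated channels match the original model; with accumulate_len = [5, 6, 8, 10, 12], layer 6 belongs to branch1, 7-8 to branch2, 9-10 to branch3 and 11-12 to branch4):

import torch
from models.InceptionBlock import InceptionBlock, construct_edge_cloud_inception_block

model = InceptionBlock(in_channels=3)
x = torch.rand((1, 3, 224, 224))
partition_edges = [(6, 12), (7, 8), (9, 10), (11, 12)]  # one hypothetical cut per branch
edge_model, cloud_model = construct_edge_cloud_inception_block(model, partition_edges)
intermediate = edge_model(x)     # a list with one tensor per branch
out = cloud_model(intermediate)  # remaining branch layers + concat on the cloud
print(out.shape)                 # torch.Size([1, 256, 27, 27]) with the default channels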
/models/InceptionBlockV2.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch import Tensor
4 | from typing import Optional, List, Callable, Any
5 | from collections import abc
6 |
7 |
8 | def getBlockIndex(item, accumulate_len):
9 |     """
10 |     Given an index item, determine which sub-module the layer represented by that item should be selected from
11 |     :param item: item or index, the layer index counted from 0
12 |     :param accumulate_len: a list of the accumulated lengths of each part
13 |     :return: the corresponding module index part_index; part_index = 0 stands for features, and so on (part_index = 1 stands for inception3)
14 |     """
15 |     for part_index in range(len(accumulate_len)):
16 |         part_len = accumulate_len[part_index]
17 |         # find which module the layer belongs to
18 |         if item < part_len:
19 |             return part_index
20 |     return len(accumulate_len)
21 |
22 |
23 | class BasicConv2d(torch.nn.Sequential):
24 |     """
25 |     A simple block structure: conv + bn
26 |     """
27 |     def __init__(self, in_channels: int, out_channels: int, **kwargs: Any) -> None:
28 |         conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
29 |         bn = nn.BatchNorm2d(out_channels, eps=0.001)
30 |         layers = [conv, bn]
31 |         super(BasicConv2d, self).__init__(*layers)
32 |
33 |
34 | class Operation_Concat(nn.Module):
35 |     """
36 |     Operation_Concat performs the final concatenation
37 |     """
38 |     def __init__(self):
39 |         super().__init__()
40 |         self.res = 0
41 |     def forward(self, outputs):
42 |         self.res = torch.cat(outputs, 1)
43 |         return self.res
44 |
45 |
46 | class InceptionBlockV2(nn.Module):
47 |     """
48 |     Builds an InceptionBlockV2 structure: a model with a DAG topology
49 |     """
50 |     def __init__(self,
51 |                  in_channels: int = 3, ch1x1: int = 64, ch3x3red: int = 64,
52 |                  ch3x3: int = 96, ch3x3_double_red: int = 64, ch3x3_double: int = 96,
53 |                  pool_proj: int = 32, conv_block: Optional[Callable[..., nn.Module]] = None,
54 |                  ) -> None:
55 |         super(InceptionBlockV2, self).__init__()
56 |         if conv_block is None:
57 |             conv_block = BasicConv2d
58 |
59 |         self.preInference = nn.Sequential(
60 |             conv_block(in_channels, 64, kernel_size=7, stride=2, padding=3),
61 |             nn.MaxPool2d(3, stride=2),
62 |             conv_block(64, 64, kernel_size=1),
63 |             conv_block(64, 192, kernel_size=3, padding=1),
64 |             nn.MaxPool2d(3, stride=2),
65 |         )
66 |         out_pre_channels = 192
67 |
68 |         self.branch1 = nn.Sequential(
69 |             conv_block(out_pre_channels, ch1x1, kernel_size=1)
70 |         )
71 |
72 |         self.branch2 = nn.Sequential(
73 |             conv_block(out_pre_channels, ch3x3red, kernel_size=1),
74 |             conv_block(ch3x3red, ch3x3, kernel_size=3, padding=1)
75 |         )
76 |
77 |         self.branch3 = nn.Sequential(  # the double-3x3 branch uses the ch3x3_double channel settings
78 |             conv_block(out_pre_channels, ch3x3_double_red, kernel_size=1),
79 |             conv_block(ch3x3_double_red, ch3x3_double, kernel_size=3, padding=1),
80 |             conv_block(ch3x3_double, ch3x3_double, kernel_size=3, padding=1),
81 |         )
82 |
83 |         self.branch4 = nn.Sequential(
84 |             nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
85 |             conv_block(out_pre_channels, pool_proj, kernel_size=1),
86 |         )
87 |         # concat implements the final concatenation
88 |         self.concat = Operation_Concat()
89 |         self.branch_list = [self.preInference, self.branch1, self.branch2, self.branch3, self.branch4]
90 |
91 |         # accumulated lengths of the inception parts, used for iterating inside the inception
92 |         self.accumulate_len = []
93 |         for i in range(len(self.branch_list)):
94 |             if i == 0:
95 |                 self.accumulate_len.append(len(self.branch_list[i]))
96 |             else:
97 |                 self.accumulate_len.append(self.accumulate_len[i-1] + len(self.branch_list[i]))
98 |
99 |
100 |         # for a DAG topology, the following settings must be provided by hand
101 |         self.has_dag_topology = True
102 |         self.record_output_list = [self.accumulate_len[0], self.accumulate_len[1], self.accumulate_len[2],
103 |                                    self.accumulate_len[3], self.accumulate_len[4]]  # which layers need to save their outputs
104 |         self.dag_dict = {  # defines the inputs of the DAG-related layers
105 |             self.accumulate_len[0] + 1: self.accumulate_len[0],
106 |             self.accumulate_len[1] + 1: self.accumulate_len[0],
107 |             self.accumulate_len[2] + 1: self.accumulate_len[0],
108 |             self.accumulate_len[3] + 1: self.accumulate_len[0],
109 |             self.accumulate_len[4] + 1: [self.accumulate_len[1], self.accumulate_len[2],
110 |                                          self.accumulate_len[3], self.accumulate_len[4], ]
111 |         }
112 |
113 |
114 |     def _forward(self, x: Tensor) -> List[Tensor]:
115 |         branch1 = self.branch1(x)
116 |         branch2 = self.branch2(x)
117 |         branch3 = self.branch3(x)
118 |         branch4 = self.branch4(x)
119 |
120 |         outputs = [branch1, branch2, branch3, branch4]
121 |         return outputs
122 |
123 |     def forward(self, x: Tensor) -> Tensor:
124 |         x = self.preInference(x)
125 |         outputs = self._forward(x)
126 |         return self.concat(outputs)
127 |
128 |     def __len__(self):
129 |         return self.accumulate_len[-1] + 1
130 |
131 |     def __getitem__(self, item):
132 |         # stop the iteration if the index is out of range
133 |         if item >= self.accumulate_len[-1] + 1:
134 |             raise StopIteration()
135 |
136 |         # fetch the correct DNN layer according to the given item
137 |         part_index = getBlockIndex(item, self.accumulate_len)
138 |         if part_index == 0:
139 |             layer = self.branch_list[part_index][item]
140 |         elif part_index < len(self.accumulate_len):
141 |             layer = self.branch_list[part_index][item - self.accumulate_len[part_index - 1]]
142 |         else:
143 |             layer = self.concat
144 |         return layer
145 |
146 |     def __iter__(self):
147 |         return Inception_SentenceIterator(self.branch_list, self.concat, self.accumulate_len)
148 |
149 |
150 |
151 | class Inception_SentenceIterator(abc.Iterator):
152 |     def __init__(self, branch_list, concat, accumulate_len):
153 |         self.branch_list = branch_list
154 |         self.accumulate_len = accumulate_len
155 |         self.concat = concat
156 |
157 |         self._index = 0
158 |
159 |     def __next__(self):
160 |         # stop the iteration if the index is out of range
161 |         if self._index >= self.accumulate_len[-1] + 1:
162 |             raise StopIteration()
163 |
164 |         # fetch the correct DNN layer according to the current index
165 |         part_index = getBlockIndex(self._index, self.accumulate_len)
166 |         if part_index == 0:
167 |             layer = self.branch_list[part_index][self._index]
168 |         elif part_index < len(self.accumulate_len):
169 |             layer = self.branch_list[part_index][self._index - self.accumulate_len[part_index - 1]]
170 |         else:
171 |             layer = self.concat
172 |
173 |         self._index += 1
174 |         return layer
175 |
176 |
177 |
178 | class inception_dag_part(nn.Module):
179 |     """
180 |     Extracts the DAG part of the inception. Let p be the number of layers in self.preInference;
181 |     if the partition happens at layer p (inclusive) or later, the remaining part can directly use inception_dag_part
182 |     """
183 |     def __init__(self, branches):
184 |         super(inception_dag_part, self).__init__()
185 |         self.branch1 = branches[0]
186 |         self.branch2 = branches[1]
187 |         self.branch3 = branches[2]
188 |         self.branch4 = branches[3]
189 |         self.concat = Operation_Concat()
190 |     def forward(self, x):
191 |         branch1 = self.branch1(x)
192 |         branch2 = self.branch2(x)
193 |         branch3 = self.branch3(x)
194 |         branch4 = self.branch4(x)
195 |
196 |         outputs = [branch1, branch2, branch3, branch4]
197 |         return self.concat(outputs)
198 |
199 |
200 | class EdgeInception(nn.Module):
201 |     """
202 |     EdgeInception builds the edge-side part of a partitioned Inception
203 |     """
204 |     def __init__(self, edge_branches):
205 |         super(EdgeInception, self).__init__()
206 |         self.branch1 = edge_branches[0]
207 |         self.branch2 = edge_branches[1]
208 |         self.branch3 = edge_branches[2]
209 |         self.branch4 = edge_branches[3]
210 |     def forward(self, x):
211 |         branch1 = self.branch1(x)
212 |         branch2 = self.branch2(x)
213 |         branch3 = self.branch3(x)
214 |         branch4 = self.branch4(x)
215 |
216 |         outputs = [branch1, branch2, branch3, branch4]
217 |         return outputs
218 |
219 |
220 | class CloudInception(nn.Module):
221 |     """
222 |     CloudInception builds the cloud-side part of a partitioned Inception
223 |     """
224 |     def __init__(self, cloud_branches):
225 |         super(CloudInception, self).__init__()
226 |         self.branch1 = cloud_branches[0]
227 |         self.branch2 = cloud_branches[1]
228 |         self.branch3 = cloud_branches[2]
229 |         self.branch4 = cloud_branches[3]
230 |         self.concat = Operation_Concat()
231 |
232 |     def forward(self, x):
233 |         branch1 = self.branch1(x[0])
234 |         branch2 = self.branch2(x[1])
235 |         branch3 = self.branch3(x[2])
236 |         branch4 = self.branch4(x[3])
237 |
238 |         outputs = [branch1, branch2, branch3, branch4]
239 |         return self.concat(outputs)
240 |
241 |
242 | def construct_edge_cloud_inception_block(model: InceptionBlockV2, model_partition_edge: list):
243 |     """
244 |     Build the edge model and the cloud model of the Inception
245 |     :param model: an Inception block that needs to be partitioned
246 |     :param model_partition_edge: the partition points of the Inception (start_layer,end_layer)
247 |     :return: edge_Inception, cloud_Inception
248 |     """
249 |     accumulate_len = model.accumulate_len
250 |     edge_model, cloud_model = nn.Sequential(), nn.Sequential()
251 |     if len(model_partition_edge) == 1:  # only one partition point is needed
252 |         partition_point = model_partition_edge[0][0]
253 |         assert partition_point <= accumulate_len[0] + 1
254 |         idx = 1
255 |         for layer in model:
256 |             if idx > accumulate_len[0]: break
257 |             if idx <= partition_point:
258 |                 edge_model.add_module(f"{idx}-{layer.__class__.__name__}", layer)
259 |             else:
260 |                 cloud_model.add_module(f"{idx}-{layer.__class__.__name__}", layer)
261 |             idx += 1
262 |         layer = inception_dag_part(model.branch_list[1:])
263 |         cloud_model.add_module(f"{idx}-{layer.__class__.__name__}", layer)
264 |     else:  # the partition runs across the 4 branches
265 |         assert len(model_partition_edge) == 4
266 |         branches = model.branch_list[1:]
267 |         edge_model.add_module(f"1-preInference", model.preInference)
268 |
269 |         edge_branches = []
270 |         cloud_branches = []
271 |         for edge in model_partition_edge:
272 |             edge_branch = nn.Sequential()
273 |             cloud_branch = nn.Sequential()
274 |
275 |             block, tmp_point = None, None
276 |             if edge[0] in range(accumulate_len[0] + 1, accumulate_len[1] + 1) or edge[1] in range(accumulate_len[0] + 1, accumulate_len[1] + 1):
277 |                 block = branches[0]
278 |                 tmp_point = edge[0] - accumulate_len[0]
279 |             elif edge[0] in range(accumulate_len[1] + 1, accumulate_len[2] + 1) or edge[1] in range(accumulate_len[1] + 1, accumulate_len[2] + 1):
280 |                 block = branches[1]
281 |                 tmp_point = edge[0] - accumulate_len[1]
282 |             elif edge[0] in range(accumulate_len[2] + 1, accumulate_len[3] + 1) or edge[1] in range(accumulate_len[2] + 1, accumulate_len[3] + 1):
283 |                 block = branches[2]
284 |                 tmp_point = edge[0] - accumulate_len[2]
285 |             elif edge[0] in range(accumulate_len[3] + 1, accumulate_len[4] + 1) or edge[1] in range(accumulate_len[3] + 1, accumulate_len[4] + 1):
286 |                 block = branches[3]
287 |                 tmp_point = edge[0] - accumulate_len[3]
288 |
289 |             idx = 1
290 |             for layer in block:
291 |                 if idx <= tmp_point:
292 |                     edge_branch.add_module(f"{idx}-{layer.__class__.__name__}", layer)
293 |                 else:
294 |                     cloud_branch.add_module(f"{idx}-{layer.__class__.__name__}", layer)
295 |                 idx += 1
296 |
297 |             edge_branches.append(edge_branch)
298 |             cloud_branches.append(cloud_branch)
299 |
300 |         # use edge_branches and cloud_branches to build the EdgeInception and CloudInception modules
301 |         edge_Inception = EdgeInception(edge_branches)
302 |         cloud_Inception = CloudInception(cloud_branches)
303 |
304 |         edge_model.add_module(f"2-edge-inception", edge_Inception)
305 |         cloud_model.add_module(f"1-cloud-inception", cloud_Inception)
306 |     return edge_model, cloud_model
307 |
--------------------------------------------------------------------------------
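A shape check for the variant above (illustrative, not repository code; with the default channel settings the four branches concatenate to 64 + 96 + 96 + 32 = 288 channels):

import torch
from models.InceptionBlockV2 import InceptionBlockV2

model = InceptionBlockV2(in_channels=3)
x = torch.rand((1, 3, 224, 224))
print(model(x).shape)  # torch.Size([1, 288, 27, 27])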
/dads_framework/graph_construct.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | import sys
3 | import torch
4 | from utils.inference_utils import recordTime
5 | from net.net_utils import get_speed
6 | import pickle
7 |
8 | inf = sys.maxsize
9 | construction_time = 0.0
10 | predictor_dict = {}
11 |
12 |
13 | def get_layers_latency(model, device):
14 |     """
15 |     Get the per-layer inference latency of model on the cloud device or the edge device, used for constructing the directed graph
16 |     :param model: the DNN model
17 |     :param device: the inference device
18 |     :return: layers_latency[], the inference latency of each layer
19 |     """
20 |     dict_layer_output = {}
21 |     input = torch.rand((1, 3, 224, 224))  # the initial input data
22 |
23 |     layers_latency = []
24 |     for layer_index, layer in enumerate(model):
25 |         # first check whether the input of this layer has to be modified
26 |         if model.has_dag_topology and (layer_index + 1) in model.dag_dict.keys():
27 |             pre_input_cond = model.dag_dict[layer_index + 1]  # fetch its predecessor-input condition
28 |             if isinstance(pre_input_cond, list):  # if it is a list, the current layer has multiple inputs
29 |                 input = []
30 |                 for pre_index in pre_input_cond:  # for a concat operation the input should be a list
31 |                     input.append(dict_layer_output[pre_index])
32 |             else:  # the input of the current layer is obtained from another layer
33 |                 input = dict_layer_output[pre_input_cond]
34 |
35 |         if not isinstance(input, list):
36 |             input = input.to(device)  # move the data onto the corresponding device
37 |
38 |         layer = layer.to(device)  # move the layer onto the corresponding device
39 |         input, lat = recordTime(layer, input, device, epoch_cpu=10, epoch_gpu=10)  # record the inference latency
40 |
41 |         if model.has_dag_topology and (layer_index + 1) in model.record_output_list:
42 |             dict_layer_output[layer_index + 1] = input
43 |         layers_latency.append(lat)
44 |     return layers_latency
45 |
46 |
47 | def add_graph_edge(graph, vertex_index, input, layer_index, layer,
48 |                    bandwidth, net_type, edge_latency, cloud_latency,
49 |                    dict_input_size_node_name, dict_node_layer, dict_layer_input_size, dict_layer_output,
50 |                    record_flag):
51 |     """
52 |     Add the edges for one DNN layer to a directed graph
53 |     :param graph: the directed graph to add edges to
54 |     :param vertex_index: the index of the vertex currently being constructed
55 |     :param input: the input of the current layer
56 |     :param layer_index: the index of the current layer
57 |     :param layer: the type of the current layer
58 |     :param bandwidth: the network bandwidth
59 |     :param net_type: the network type
60 |     :param edge_latency: the inference latency on the edge device
61 |     :param cloud_latency: the inference latency on the cloud device
62 |     :param dict_input_size_node_name: dict - key: input, value: the corresponding vertex index
63 |     :param dict_node_layer: dict - key: vertex index, value: the corresponding DNN layer index
64 |     :param dict_layer_input_size: dict - key: DNN layer index, value: the corresponding input size
65 |     :param dict_layer_output: dict - key: DNN layer index, value: the corresponding output
66 |     :param record_flag: only certain key layers record their output
67 |     :return: the current number of constructed vertices vertex_index, and the output of the current layer (used as the input of the next layer)
68 |     """
69 |     cloud_vertex = "cloud"  # the cloud device node
70 |     edge_vertex = "edge"  # the edge device node
71 |
72 |     # get the inference latency of the current layer on the edge device and on the cloud device
73 |     # edge_lat = predict_model_latency(input, layer, device="edge", predictor_dict=predictor_dict)
74 |     # cloud_lat = predict_model_latency(input, layer, device="cloud", predictor_dict=predictor_dict)
75 |
76 |     # get the transmission latency required by the current layer
77 |     # predict transmission latency, network_type = WI-FI
78 |     transport_size = len(pickle.dumps(input))
79 |     speed = get_speed(network_type=net_type, bandwidth=bandwidth)
80 |     transmission_lat = transport_size / speed
81 |
82 |     # one dnn layer builds one edge, and building an edge requires two vertices
83 |     # dict_input_size_node_name maps an input to the corresponding graph vertex,
84 |     # so start_node and end_node can be constructed before and after executing the dnn layer
85 |     start_node, end_node, record_input = None, None, None
86 |
87 |     if isinstance(input, list):
88 |         layer_out = None
89 |         record_input = input
90 |         for one_input in input:
91 |             vertex_index, start_node = get_node_name(one_input, vertex_index, dict_input_size_node_name)
92 |             layer_out = layer(input)
93 |             vertex_index, end_node = get_node_name(layer_out, vertex_index, dict_input_size_node_name)
94 |
95 |             # e.g. if input is a list of length n, then n edges have to be built
96 |             graph.add_edge(start_node, end_node, capacity=transmission_lat)  # add an edge from the previous node to the current node
97 |         input = layer_out
98 |     else:  # regular construction
99 |         vertex_index, start_node = get_node_name(input, vertex_index, dict_input_size_node_name)
100 |         record_input = input
101 |         input = layer(input)
102 |         vertex_index, end_node = get_node_name(input, vertex_index, dict_input_size_node_name)
103 |
104 |         # prevent no-op layers from overwriting the original data; this filters out relu or dropout layers
105 |         if start_node == end_node:
106 |             return vertex_index, input  # no construction needed
107 |         graph.add_edge(start_node, end_node, capacity=transmission_lat)  # add an edge from the previous node to the current node
108 |
109 |     # note: end_node is used to represent the current dnn-layer in the directed graph
110 |     graph.add_edge(edge_vertex, end_node, capacity=cloud_latency)  # add an edge from the edge node to the dnn layer, weighted by the cloud inference latency
111 |     graph.add_edge(end_node, cloud_vertex, capacity=edge_latency)  # add an edge from the dnn layer to the cloud device, weighted by the edge inference latency
112 |
113 |     dict_node_layer[end_node] = layer_index + 1  # record which DNN layer this digraph vertex corresponds to
114 |     # dict_layer_input_size[layer_index + 1] = record_input.shape  # record the input size corresponding to layer i of the DNN
115 |     if record_flag:
116 |         dict_layer_output[layer_index + 1] = input  # record the output corresponding to layer i of the DNN
117 |
118 |     return vertex_index, input
119 |
120 |
121 |
122 | def graph_construct(model, input, edge_latency_list, cloud_latency_list, bandwidth, net_type="wifi"):
123 |     """
124 |     Given a DNN model, graph_construct builds the model into a directed graph with the corresponding weights.
125 |     The construction mainly covers three kinds of edges:
126 |     (1) edges from the edge device to a dnn layer, weighted by the cloud inference latency
127 |     (2) edges between dnn layers, weighted by the transmission latency
128 |     (3) edges from a dnn layer to the cloud device, weighted by the edge inference latency
129 |     :param model: the dnn model
130 |     :param input: the initial input of the dnn model
131 |     :param edge_latency_list: the per-layer inference latency on the edge device
132 |     :param cloud_latency_list: the per-layer inference latency on the cloud device
133 |     :param bandwidth: the current network bandwidth in MB/s, obtainable from the bandwidth monitor
134 |     :param net_type: the current network type, "wifi" by default
135 |     :return: the constructed digraph graph, dict_node_layer, dict_layer_input
136 |
137 |     Since GoogleNet and ResNet cannot simply be executed step by step with x = layer(x),
138 |     a new get_min_cut_value_for_ResBlock has to be customized;
139 |     so for a new DAG structure users should (1) extend the existing construction logic and (2) customize the iterable api
140 |     """
141 |     graph = nx.DiGraph()
142 |
143 |     """
144 |     The role of the dict_for_input dictionary:
145 |     :key tuple (input.size, input_slice) - the key is the shape of the input plus a slice of the input (its first 3 values)
146 |     :value the corresponding vertex node_name in the constructed digraph
147 |     dict_for_input can turn a DNN layer into the vertex node_name in the digraph
148 |     Principle: the input data of each layer in the DNN is unique
149 |     """
150 |     dict_input_size_node_name = {}
151 |
152 |     """
153 |     The role of the dict_node_layer dictionary:
154 |     :key node_name - the name of a vertex in the digraph
155 |     :value the layer_index of the corresponding layer in the original DNN
156 |     The vertex node_name of the digraph can be used to find the corresponding layer in the original DNN model
157 |     Note:
158 |     layer_index = 0 stands for the initial input
159 |     layer_index > 0 means the vertex stands for layer layer_index of the original DNN; to fetch the original layer use model[layer_index-1]
160 |     """
161 |     dict_node_layer = {"v0": 0}  # initialise v0 as the initial input
162 |
163 |     """
164 |     The role of the dict_layer_input and dict_layer_output dictionaries:
165 |     :key the layer_index in the original DNN
166 |     :value the input and output of layer layer_index of the DNN
167 |     The shape plus the first three elements are enough to decide whether two inputs are the same
168 |     Note:
169 |     layer_index = 0 stands for the initial input
170 |     layer_index = n fetches the input of layer model[layer_index-1] of the original model
171 |     """
172 |     dict_layer_input = {0: None}  # layer 0 is the initial input, its input is recorded as None
173 |     dict_layer_output = {0: input}  # layer 0 is the initial input, its output is the input itself
174 |
175 |     cloud_vertex = "cloud"  # the cloud device node
176 |     edge_vertex = "edge"  # the edge device node
177 |
178 |     print(f"start construct graph for model...")
179 |     graph.add_edge(edge_vertex, "v0", capacity=inf)  # build the initial model input v0
180 |     vertex_index = 0  # the vertex index used while building the graph
181 |
182 |     for layer_index, layer in enumerate(model):
183 |         # print(layer_index, layer)
184 |         # first check whether the input of this layer has to be modified
185 |         if model.has_dag_topology and (layer_index + 1) in model.dag_dict.keys():
186 |             pre_input_cond = model.dag_dict[layer_index + 1]  # fetch its predecessor-input condition
187 |             if isinstance(pre_input_cond, list):  # if it is a list, the current layer has multiple inputs
188 |                 input = []
189 |                 for pre_index in pre_input_cond:  # for a concat operation the input should be a list
190 |                     input.append(dict_layer_output[pre_index])
191 |             else:  # the input of the current layer is obtained from another layer
192 |                 input = dict_layer_output[pre_input_cond]
193 |
194 |         # DNN layers listed in the model's record_output_list need to record their output
195 |         record_flag = model.has_dag_topology and (layer_index + 1) in model.record_output_list
196 |         # build the edges using the modified input
197 |         vertex_index, input = add_graph_edge(graph, vertex_index, input, layer_index, layer,
198 |                                              bandwidth, net_type,
199 |                                              edge_latency_list[layer_index], cloud_latency_list[layer_index],
200 |                                              dict_input_size_node_name, dict_node_layer,
201 |                                              dict_layer_input, dict_layer_output,
                                             record_flag=record_flag)
202 |
203 |     # mainly handles vertices whose out-degree is greater than 1
204 |     prepare_for_partition(graph, vertex_index, dict_node_layer)
205 |     return graph, dict_node_layer, dict_layer_input
206 |
207 |
208 | def get_node_name(input, vertex_index, dict_input_size_node_name):
209 |     """
210 |     Build the corresponding vertex name node_name from the input
211 |     :param input: the input of the current layer
212 |     :param vertex_index: the vertex index, i.e. which vertex should be created next
213 |     :param dict_input_size_node_name: maps a DNN layer to the vertex node_name in the digraph
214 |     :return: node_name, the head and tail node names needed for building a DAG edge
215 |     """
216 |     len_of_shape = len(input.shape)
217 |     input_shape = str(input.shape)  # get the size of the current input
218 |
219 |     input_slice = input
220 |     for _ in range(len_of_shape - 1):
221 |         input_slice = input_slice[0]
222 |     input_slice = str(input_slice[:3])  # take the first 3 values of the input to guarantee uniqueness
223 |
224 |     if (input_shape, input_slice) not in dict_input_size_node_name.keys():
225 |         node_name = "v" + str(vertex_index)
226 |         dict_input_size_node_name[(input_shape, input_slice)] = node_name  # create and save a new node
227 |         vertex_index += 1
228 |     else:
229 |         node_name = dict_input_size_node_name[(input_shape, input_slice)]  # fetch the existing node from the dict so the digraph is built correctly
230 |     return vertex_index, node_name
231 |
232 |
233 | def prepare_for_partition(graph, vertex_index, dict_node_layer):
234 |     """
235 |     Post-process the DAG graph that has been built from the DNN model:
236 |     1 - record vertices with multiple outgoing edges as start_vex
237 |     2 - generate a new node node_name; the edge node_name -> start_vex represents the transmission cost, and the original outgoing edges of start_vex are changed to inf
238 |     3 - find the edges to delete: edges whose original end point is start_vex, and redirect them to the new node node_name
239 |     4 - delete the edges from cloud and edge to the original node
240 |     :param graph: the DAG graph that has been built
241 |     :param vertex_index: the index of the next node to generate
242 |     :param dict_node_layer: records which DNN layer a digraph vertex corresponds to
243 |     :return:
244 |     """
245 |     map_for_vex = []  # handles the case where one vertex of graph points to several other vertices
246 |     multiple_out_vex = []  # stores the vertices with multiple outgoing edges
247 |     for edge in graph.edges.data():
248 |         start_vex = edge[0]
249 |         end_vex = edge[1]
250 |         if start_vex == "edge" or end_vex == "cloud":
251 |             continue
252 |         if start_vex not in map_for_vex:  # save the predecessor vertex when it appears for the first time
253 |             map_for_vex.append(start_vex)
254 |         elif start_vex not in multiple_out_vex:  # if it has already appeared, its out-degree is greater than 1, so record it in multiple_out_vex
255 |             multiple_out_vex.append(start_vex)
256 |
257 |     for start_vex in multiple_out_vex:
258 |         # generate a new node
259 |         node_name = "v" + str(vertex_index)
260 |         vertex_index += 1
261 |         dict_node_layer[node_name] = dict_node_layer[start_vex]  # the new node corresponds to the same layer as the original node
262 |
263 |         # fix up the old node
264 |         modify_edges = []  # record the edges to modify, i.e. those starting from start_vex, whose weight will be changed to inf
265 |         for edge in graph.edges.data():
266 |             if edge[0] == "edge" or edge[1] == "cloud":
267 |                 continue
268 |             if edge[0] == start_vex:
269 |                 modify_edges.append(edge)
270 |
271 |         # add the new edges
272 |         for edge in modify_edges:
273 |             graph.add_edge(edge[0], node_name, capacity=edge[2]["capacity"])  # add a new edge from start_vex to node_name
274 |             graph.add_edge(node_name, edge[1], capacity=inf)  # add a new edge from node_name to edge[1] with weight inf
275 |             graph.remove_edge(edge[0], edge[1])  # remove the original edge
276 |
277 |         # remove edge -> old node
278 |         # if graph.has_edge("edge", start_vex):
279 |         #     data = graph.get_edge_data("edge", start_vex)["capacity"]
280 |         #     graph.add_edge("edge", node_name, capacity=data)
281 |         #     graph.remove_edge("edge", start_vex)
282 |         # remove old node -> cloud
283 |         # if graph.has_edge(start_vex, "cloud"):
284 |         #     data = graph.get_edge_data(start_vex, "cloud")["capacity"]
285 |         #     graph.add_edge(node_name, "cloud", capacity=data)
286 |         #     graph.remove_edge(start_vex, "cloud")
287 |
288 |     # simplify the edge values; three decimal places are enough for the computation
289 |     for edge in graph.edges.data():
290 |         graph.add_edge(edge[0], edge[1], capacity=round(edge[2]["capacity"], 3))
291 |     return vertex_index
292 | --------------------------------------------------------------------------------
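An end-to-end sketch tying the pieces above together (illustrative, not repository code; it assumes the module paths used in this repository, uses "cpu" as a stand-in for both devices, and picks a hypothetical bandwidth of 10 MB/s; mapping graph vertices back to layer indices through dict_node_layer mirrors what dads_framework/dads.py is expected to do):

import torch
from utils.inference_utils import get_dnn_model, model_partition
from dads_framework.graph_construct import get_layers_latency, graph_construct
from dads_framework.dinic import dinic_algorithm, get_min_cut_set

model = get_dnn_model("easy_net")
x = torch.rand((1, 3, 224, 224))

edge_lat = get_layers_latency(model, device="cpu")   # per-layer latency on the edge device
cloud_lat = get_layers_latency(model, device="cpu")  # per-layer latency on the cloud device

graph, dict_node_layer, _ = graph_construct(model, x, edge_lat, cloud_lat, bandwidth=10.0)
min_cut_value, reachable, non_reachable = dinic_algorithm(graph)
graph_edges = get_min_cut_set(graph, min_cut_value, reachable, non_reachable)

# translate graph vertices back into DNN layer indices before splitting the model
model_partition_edge = [(dict_node_layer[u], dict_node_layer[v]) for u, v in graph_edges]
edge_model, cloud_model = model_partition(model, model_partition_edge)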