├── LICENSE ├── README.md ├── manager.py └── manager_torch.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tf_gpu_manager 2 | A GPU device manager for choosing the freest GPU. 
3 | # Example: 4 | gm=GPUManager() 5 | with gm.auto_choice(): 6 | blabla 7 | -------------------------------------------------------------------------------- /manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 7 19:38:30 2017 4 | 5 | @author: Quantum Liu 6 | """ 7 | 8 | ''' 9 | Example: 10 | gm=GPUManager() 11 | with gm.auto_choice(): 12 | blabla 13 | ''' 14 | 15 | import os 16 | import tensorflow as tf 17 | #from tensorflow.python.client import device_lib 18 | 19 | def check_gpus(): 20 | ''' 21 | GPU available check 22 | reference : http://feisky.xyz/machine-learning/tensorflow/gpu_list.html 23 | ''' 24 | # ============================================================================= 25 | # all_gpus = [x.name for x in device_lib.list_local_devices() if x.device_type == 'GPU'] 26 | # ============================================================================= 27 | first_gpus = os.popen('nvidia-smi --query-gpu=index --format=csv,noheader').readlines()[0].strip() 28 | if not first_gpus=='0': 29 | print('This script could only be used to manage NVIDIA GPUs,but no GPU found in your device') 30 | return False 31 | elif not 'NVIDIA System Management' in os.popen('nvidia-smi -h').read(): 32 | print("'nvidia-smi' tool not found.") 33 | return False 34 | return True 35 | 36 | if check_gpus(): 37 | def parse(line,qargs): 38 | ''' 39 | line: 40 | a line of text 41 | qargs: 42 | query arguments 43 | return: 44 | a dict of gpu infos 45 | Pasing a line of csv format text returned by nvidia-smi 46 | 解析一行nvidia-smi返回的csv格式文本 47 | ''' 48 | numberic_args = ['memory.free', 'memory.total', 'power.draw', 'power.limit']#可计数的参数 49 | power_manage_enable=lambda v:(not 'Not Support' in v)#lambda表达式,显卡是否滋瓷power management(笔记本可能不滋瓷) 50 | to_numberic=lambda v:float(v.upper().strip().replace('MIB','').replace('W',''))#带单位字符串去掉单位 51 | process = lambda k,v:((int(to_numberic(v)) if 
power_manage_enable(v) else 1) if k in numberic_args else v.strip()) 52 | return {k:process(k,v) for k,v in zip(qargs,line.strip().split(','))} 53 | 54 | def query_gpu(qargs=[]): 55 | ''' 56 | qargs: 57 | query arguments 58 | return: 59 | a list of dict 60 | Querying GPUs infos 61 | 查询GPU信息 62 | ''' 63 | qargs =['index','gpu_name', 'memory.free', 'memory.total', 'power.draw', 'power.limit']+ qargs 64 | cmd = 'nvidia-smi --query-gpu={} --format=csv,noheader'.format(','.join(qargs)) 65 | results = os.popen(cmd).readlines() 66 | return [parse(line,qargs) for line in results] 67 | 68 | def by_power(d): 69 | ''' 70 | helper function fo sorting gpus by power 71 | ''' 72 | power_infos=(d['power.draw'],d['power.limit']) 73 | if any(v==1 for v in power_infos): 74 | print('Power management unable for GPU {}'.format(d['index'])) 75 | return 1 76 | return float(d['power.draw'])/d['power.limit'] 77 | 78 | class GPUManager(): 79 | ''' 80 | qargs: 81 | query arguments 82 | A manager which can list all available GPU devices 83 | and sort them and choice the most free one.Unspecified 84 | ones pref. 
85 | GPU设备管理器,考虑列举出所有可用GPU设备,并加以排序,自动选出 86 | 最空闲的设备。在一个GPUManager对象内会记录每个GPU是否已被指定, 87 | 优先选择未指定的GPU。 88 | ''' 89 | def __init__(self,qargs=[]): 90 | ''' 91 | ''' 92 | self.qargs=qargs 93 | self.gpus=query_gpu(qargs) 94 | for gpu in self.gpus: 95 | gpu['specified']=False 96 | self.gpu_num=len(self.gpus) 97 | 98 | def _sort_by_memory(self,gpus,by_size=False): 99 | if by_size: 100 | print('Sorted by free memory size') 101 | return sorted(gpus,key=lambda d:d['memory.free'],reverse=True) 102 | else: 103 | print('Sorted by free memory rate') 104 | return sorted(gpus,key=lambda d:float(d['memory.free'])/ d['memory.total'],reverse=True) 105 | 106 | def _sort_by_power(self,gpus): 107 | return sorted(gpus,key=by_power) 108 | 109 | def _sort_by_custom(self,gpus,key,reverse=False,qargs=[]): 110 | if isinstance(key,str) and (key in qargs): 111 | return sorted(gpus,key=lambda d:d[key],reverse=reverse) 112 | if isinstance(key,type(lambda a:a)): 113 | return sorted(gpus,key=key,reverse=reverse) 114 | raise ValueError("The argument 'key' must be a function or a key in query args,please read the documention of nvidia-smi") 115 | 116 | def auto_choice(self,mode=0): 117 | ''' 118 | mode: 119 | 0:(default)sorted by free memory size 120 | return: 121 | a TF device object 122 | Auto choice the freest GPU device,not specified 123 | ones 124 | 自动选择最空闲GPU 125 | ''' 126 | for old_infos,new_infos in zip(self.gpus,query_gpu(self.qargs)): 127 | old_infos.update(new_infos) 128 | unspecified_gpus=[gpu for gpu in self.gpus if not gpu['specified']] or self.gpus 129 | 130 | if mode==0: 131 | print('Choosing the GPU device has largest free memory...') 132 | chosen_gpu=self._sort_by_memory(unspecified_gpus,True)[0] 133 | elif mode==1: 134 | print('Choosing the GPU device has highest free memory rate...') 135 | chosen_gpu=self._sort_by_power(unspecified_gpus)[0] 136 | elif mode==2: 137 | print('Choosing the GPU device by power...') 138 | chosen_gpu=self._sort_by_power(unspecified_gpus)[0] 139 | else: 
140 | print('Given an unaviliable mode,will be chosen by memory') 141 | chosen_gpu=self._sort_by_memory(unspecified_gpus)[0] 142 | chosen_gpu['specified']=True 143 | index=chosen_gpu['index'] 144 | print('Using GPU {i}:\n{info}'.format(i=index,info='\n'.join([str(k)+':'+str(v) for k,v in chosen_gpu.items()]))) 145 | return tf.device('/gpu:{}'.format(index)) 146 | else: 147 | raise ImportError('GPU available check failed') 148 | -------------------------------------------------------------------------------- /manager_torch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Aug 22 19:41:55 2017 4 | 5 | @author: Quantum Liu 6 | """ 7 | ''' 8 | Example: 9 | gm=GPUManager() 10 | with torch.cuda.device(gm.auto_choice()): 11 | blabla 12 | 13 | Or: 14 | gm=GPUManager() 15 | torch.cuda.set_device(gm.auto_choice()) 16 | ''' 17 | 18 | import os 19 | import torch 20 | def check_gpus(): 21 | ''' 22 | GPU available check 23 | http://pytorch-cn.readthedocs.io/zh/latest/package_references/torch-cuda/ 24 | ''' 25 | if not torch.cuda.is_available(): 26 | print('This script could only be used to manage NVIDIA GPUs,but no GPU found in your device') 27 | return False 28 | elif not 'NVIDIA System Management' in os.popen('nvidia-smi -h').read(): 29 | print("'nvidia-smi' tool not found.") 30 | return False 31 | return True 32 | 33 | if check_gpus(): 34 | def parse(line,qargs): 35 | ''' 36 | line: 37 | a line of text 38 | qargs: 39 | query arguments 40 | return: 41 | a dict of gpu infos 42 | Pasing a line of csv format text returned by nvidia-smi 43 | 解析一行nvidia-smi返回的csv格式文本 44 | ''' 45 | numberic_args = ['memory.free', 'memory.total', 'power.draw', 'power.limit']#可计数的参数 46 | power_manage_enable=lambda v:(not 'Not Support' in v)#lambda表达式,显卡是否滋瓷power management(笔记本可能不滋瓷) 47 | to_numberic=lambda v:float(v.upper().strip().replace('MIB','').replace('W',''))#带单位字符串去掉单位 48 | process = lambda 
k,v:((int(to_numberic(v)) if power_manage_enable(v) else 1) if k in numberic_args else v.strip()) 49 | return {k:process(k,v) for k,v in zip(qargs,line.strip().split(','))} 50 | 51 | def query_gpu(qargs=[]): 52 | ''' 53 | qargs: 54 | query arguments 55 | return: 56 | a list of dict 57 | Querying GPUs infos 58 | 查询GPU信息 59 | ''' 60 | qargs =['index','gpu_name', 'memory.free', 'memory.total', 'power.draw', 'power.limit']+ qargs 61 | cmd = 'nvidia-smi --query-gpu={} --format=csv,noheader'.format(','.join(qargs)) 62 | results = os.popen(cmd).readlines() 63 | return [parse(line,qargs) for line in results] 64 | 65 | def by_power(d): 66 | ''' 67 | helper function fo sorting gpus by power 68 | ''' 69 | power_infos=(d['power.draw'],d['power.limit']) 70 | if any(v==1 for v in power_infos): 71 | print('Power management unable for GPU {}'.format(d['index'])) 72 | return 1 73 | return float(d['power.draw'])/d['power.limit'] 74 | 75 | class GPUManager(): 76 | ''' 77 | qargs: 78 | query arguments 79 | A manager which can list all available GPU devices 80 | and sort them and choice the most free one.Unspecified 81 | ones pref. 
82 | GPU设备管理器,考虑列举出所有可用GPU设备,并加以排序,自动选出 83 | 最空闲的设备。在一个GPUManager对象内会记录每个GPU是否已被指定, 84 | 优先选择未指定的GPU。 85 | ''' 86 | def __init__(self,qargs=[]): 87 | ''' 88 | ''' 89 | self.qargs=qargs 90 | self.gpus=query_gpu(qargs) 91 | for gpu in self.gpus: 92 | gpu['specified']=False 93 | self.gpu_num=len(self.gpus) 94 | 95 | def _sort_by_memory(self,gpus,by_size=False): 96 | if by_size: 97 | print('Sorted by free memory size') 98 | return sorted(gpus,key=lambda d:d['memory.free'],reverse=True) 99 | else: 100 | print('Sorted by free memory rate') 101 | return sorted(gpus,key=lambda d:float(d['memory.free'])/ d['memory.total'],reverse=True) 102 | 103 | def _sort_by_power(self,gpus): 104 | return sorted(gpus,key=by_power) 105 | 106 | def _sort_by_custom(self,gpus,key,reverse=False,qargs=[]): 107 | if isinstance(key,str) and (key in qargs): 108 | return sorted(gpus,key=lambda d:d[key],reverse=reverse) 109 | if isinstance(key,type(lambda a:a)): 110 | return sorted(gpus,key=key,reverse=reverse) 111 | raise ValueError("The argument 'key' must be a function or a key in query args,please read the documention of nvidia-smi") 112 | 113 | def auto_choice(self,mode=0): 114 | ''' 115 | mode: 116 | 0:(default)sorted by free memory size 117 | return: 118 | a TF device object 119 | Auto choice the freest GPU device,not specified 120 | ones 121 | 自动选择最空闲GPU,返回索引 122 | ''' 123 | for old_infos,new_infos in zip(self.gpus,query_gpu(self.qargs)): 124 | old_infos.update(new_infos) 125 | unspecified_gpus=[gpu for gpu in self.gpus if not gpu['specified']] or self.gpus 126 | 127 | if mode==0: 128 | print('Choosing the GPU device has largest free memory...') 129 | chosen_gpu=self._sort_by_memory(unspecified_gpus,True)[0] 130 | elif mode==1: 131 | print('Choosing the GPU device has highest free memory rate...') 132 | chosen_gpu=self._sort_by_power(unspecified_gpus)[0] 133 | elif mode==2: 134 | print('Choosing the GPU device by power...') 135 | chosen_gpu=self._sort_by_power(unspecified_gpus)[0] 136 | 
else: 137 | print('Given an unaviliable mode,will be chosen by memory') 138 | chosen_gpu=self._sort_by_memory(unspecified_gpus)[0] 139 | chosen_gpu['specified']=True 140 | index=chosen_gpu['index'] 141 | print('Using GPU {i}:\n{info}'.format(i=index,info='\n'.join([str(k)+':'+str(v) for k,v in chosen_gpu.items()]))) 142 | return int(index) 143 | else: 144 | raise ImportError('GPU available check failed') 145 | --------------------------------------------------------------------------------