├── LICENSE ├── README.md ├── manager.py └── manager_torch.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tf_gpu_manager 2 | A GPU device manager for choosing the freest GPU. 
3 | # Example: 4 | gm=GPUManager() 5 | with gm.auto_choice(): 6 | blabla 7 | -------------------------------------------------------------------------------- /manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 7 19:38:30 2017 4 | 5 | @author: Quantum Liu 6 | """ 7 | 8 | ''' 9 | Example: 10 | gm=GPUManager() 11 | with gm.auto_choice(): 12 | blabla 13 | ''' 14 | 15 | import os 16 | import tensorflow as tf 17 | #from tensorflow.python.client import device_lib 18 | 19 | def check_gpus(): 20 | ''' 21 | GPU available check 22 | reference : http://feisky.xyz/machine-learning/tensorflow/gpu_list.html 23 | ''' 24 | # ============================================================================= 25 | # all_gpus = [x.name for x in device_lib.list_local_devices() if x.device_type == 'GPU'] 26 | # ============================================================================= 27 | first_gpus = os.popen('nvidia-smi --query-gpu=index --format=csv,noheader').readlines()[0].strip() 28 | if not first_gpus=='0': 29 | print('This script could only be used to manage NVIDIA GPUs,but no GPU found in your device') 30 | return False 31 | elif not 'NVIDIA System Management' in os.popen('nvidia-smi -h').read(): 32 | print("'nvidia-smi' tool not found.") 33 | return False 34 | return True 35 | 36 | if check_gpus(): 37 | def parse(line,qargs): 38 | ''' 39 | line: 40 | a line of text 41 | qargs: 42 | query arguments 43 | return: 44 | a dict of gpu infos 45 | Pasing a line of csv format text returned by nvidia-smi 46 | 解析一行nvidia-smi返回的csv格式文本 47 | ''' 48 | numberic_args = ['memory.free', 'memory.total', 'power.draw', 'power.limit']#可计数的参数 49 | power_manage_enable=lambda v:(not 'Not Support' in v)#lambda表达式,显卡是否滋瓷power management(笔记本可能不滋瓷) 50 | to_numberic=lambda v:float(v.upper().strip().replace('MIB','').replace('W',''))#带单位字符串去掉单位 51 | process = lambda k,v:((int(to_numberic(v)) if 
power_manage_enable(v) else 1) if k in numberic_args else v.strip()) 52 | return {k:process(k,v) for k,v in zip(qargs,line.strip().split(','))} 53 | 54 | def query_gpu(qargs=[]): 55 | ''' 56 | qargs: 57 | query arguments 58 | return: 59 | a list of dict 60 | Querying GPUs infos 61 | 查询GPU信息 62 | ''' 63 | qargs =['index','gpu_name', 'memory.free', 'memory.total', 'power.draw', 'power.limit']+ qargs 64 | cmd = 'nvidia-smi --query-gpu={} --format=csv,noheader'.format(','.join(qargs)) 65 | results = os.popen(cmd).readlines() 66 | return [parse(line,qargs) for line in results] 67 | 68 | def by_power(d): 69 | ''' 70 | helper function fo sorting gpus by power 71 | ''' 72 | power_infos=(d['power.draw'],d['power.limit']) 73 | if any(v==1 for v in power_infos): 74 | print('Power management unable for GPU {}'.format(d['index'])) 75 | return 1 76 | return float(d['power.draw'])/d['power.limit'] 77 | 78 | class GPUManager(): 79 | ''' 80 | qargs: 81 | query arguments 82 | A manager which can list all available GPU devices 83 | and sort them and choice the most free one.Unspecified 84 | ones pref. 
85 | GPU设备管理器,考虑列举出所有可用GPU设备,并加以排序,自动选出 86 | 最空闲的设备。在一个GPUManager对象内会记录每个GPU是否已被指定, 87 | 优先选择未指定的GPU。 88 | ''' 89 | def __init__(self,qargs=[]): 90 | ''' 91 | ''' 92 | self.qargs=qargs 93 | self.gpus=query_gpu(qargs) 94 | for gpu in self.gpus: 95 | gpu['specified']=False 96 | self.gpu_num=len(self.gpus) 97 | 98 | def _sort_by_memory(self,gpus,by_size=False): 99 | if by_size: 100 | print('Sorted by free memory size') 101 | return sorted(gpus,key=lambda d:d['memory.free'],reverse=True) 102 | else: 103 | print('Sorted by free memory rate') 104 | return sorted(gpus,key=lambda d:float(d['memory.free'])/ d['memory.total'],reverse=True) 105 | 106 | def _sort_by_power(self,gpus): 107 | return sorted(gpus,key=by_power) 108 | 109 | def _sort_by_custom(self,gpus,key,reverse=False,qargs=[]): 110 | if isinstance(key,str) and (key in qargs): 111 | return sorted(gpus,key=lambda d:d[key],reverse=reverse) 112 | if isinstance(key,type(lambda a:a)): 113 | return sorted(gpus,key=key,reverse=reverse) 114 | raise ValueError("The argument 'key' must be a function or a key in query args,please read the documention of nvidia-smi") 115 | 116 | def auto_choice(self,mode=0): 117 | ''' 118 | mode: 119 | 0:(default)sorted by free memory size 120 | return: 121 | a TF device object 122 | Auto choice the freest GPU device,not specified 123 | ones 124 | 自动选择最空闲GPU 125 | ''' 126 | for old_infos,new_infos in zip(self.gpus,query_gpu(self.qargs)): 127 | old_infos.update(new_infos) 128 | unspecified_gpus=[gpu for gpu in self.gpus if not gpu['specified']] or self.gpus 129 | 130 | if mode==0: 131 | print('Choosing the GPU device has largest free memory...') 132 | chosen_gpu=self._sort_by_memory(unspecified_gpus,True)[0] 133 | elif mode==1: 134 | print('Choosing the GPU device has highest free memory rate...') 135 | chosen_gpu=self._sort_by_power(unspecified_gpus)[0] 136 | elif mode==2: 137 | print('Choosing the GPU device by power...') 138 | chosen_gpu=self._sort_by_power(unspecified_gpus)[0] 139 | else: 
140 | print('Given an unaviliable mode,will be chosen by memory') 141 | chosen_gpu=self._sort_by_memory(unspecified_gpus)[0] 142 | chosen_gpu['specified']=True 143 | index=chosen_gpu['index'] 144 | print('Using GPU {i}:\n{info}'.format(i=index,info='\n'.join([str(k)+':'+str(v) for k,v in chosen_gpu.items()]))) 145 | return tf.device('/gpu:{}'.format(index)) 146 | else: 147 | raise ImportError('GPU available check failed') 148 | -------------------------------------------------------------------------------- /manager_torch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Aug 22 19:41:55 2017 4 | 5 | @author: Quantum Liu 6 | """ 7 | ''' 8 | Example: 9 | gm=GPUManager() 10 | with torch.cuda.device(gm.auto_choice()): 11 | blabla 12 | 13 | Or: 14 | gm=GPUManager() 15 | torch.cuda.set_device(gm.auto_choice()) 16 | ''' 17 | 18 | import os 19 | import torch 20 | def check_gpus(): 21 | ''' 22 | GPU available check 23 | http://pytorch-cn.readthedocs.io/zh/latest/package_references/torch-cuda/ 24 | ''' 25 | if not torch.cuda.is_available(): 26 | print('This script could only be used to manage NVIDIA GPUs,but no GPU found in your device') 27 | return False 28 | elif not 'NVIDIA System Management' in os.popen('nvidia-smi -h').read(): 29 | print("'nvidia-smi' tool not found.") 30 | return False 31 | return True 32 | 33 | if check_gpus(): 34 | def parse(line,qargs): 35 | ''' 36 | line: 37 | a line of text 38 | qargs: 39 | query arguments 40 | return: 41 | a dict of gpu infos 42 | Pasing a line of csv format text returned by nvidia-smi 43 | 解析一行nvidia-smi返回的csv格式文本 44 | ''' 45 | numberic_args = ['memory.free', 'memory.total', 'power.draw', 'power.limit']#可计数的参数 46 | power_manage_enable=lambda v:(not 'Not Support' in v)#lambda表达式,显卡是否滋瓷power management(笔记本可能不滋瓷) 47 | to_numberic=lambda v:float(v.upper().strip().replace('MIB','').replace('W',''))#带单位字符串去掉单位 48 | process = lambda 
k,v:((int(to_numberic(v)) if power_manage_enable(v) else 1) if k in numberic_args else v.strip()) 49 | return {k:process(k,v) for k,v in zip(qargs,line.strip().split(','))} 50 | 51 | def query_gpu(qargs=[]): 52 | ''' 53 | qargs: 54 | query arguments 55 | return: 56 | a list of dict 57 | Querying GPUs infos 58 | 查询GPU信息 59 | ''' 60 | qargs =['index','gpu_name', 'memory.free', 'memory.total', 'power.draw', 'power.limit']+ qargs 61 | cmd = 'nvidia-smi --query-gpu={} --format=csv,noheader'.format(','.join(qargs)) 62 | results = os.popen(cmd).readlines() 63 | return [parse(line,qargs) for line in results] 64 | 65 | def by_power(d): 66 | ''' 67 | helper function fo sorting gpus by power 68 | ''' 69 | power_infos=(d['power.draw'],d['power.limit']) 70 | if any(v==1 for v in power_infos): 71 | print('Power management unable for GPU {}'.format(d['index'])) 72 | return 1 73 | return float(d['power.draw'])/d['power.limit'] 74 | 75 | class GPUManager(): 76 | ''' 77 | qargs: 78 | query arguments 79 | A manager which can list all available GPU devices 80 | and sort them and choice the most free one.Unspecified 81 | ones pref. 
82 | GPU设备管理器,考虑列举出所有可用GPU设备,并加以排序,自动选出 83 | 最空闲的设备。在一个GPUManager对象内会记录每个GPU是否已被指定, 84 | 优先选择未指定的GPU。 85 | ''' 86 | def __init__(self,qargs=[]): 87 | ''' 88 | ''' 89 | self.qargs=qargs 90 | self.gpus=query_gpu(qargs) 91 | for gpu in self.gpus: 92 | gpu['specified']=False 93 | self.gpu_num=len(self.gpus) 94 | 95 | def _sort_by_memory(self,gpus,by_size=False): 96 | if by_size: 97 | print('Sorted by free memory size') 98 | return sorted(gpus,key=lambda d:d['memory.free'],reverse=True) 99 | else: 100 | print('Sorted by free memory rate') 101 | return sorted(gpus,key=lambda d:float(d['memory.free'])/ d['memory.total'],reverse=True) 102 | 103 | def _sort_by_power(self,gpus): 104 | return sorted(gpus,key=by_power) 105 | 106 | def _sort_by_custom(self,gpus,key,reverse=False,qargs=[]): 107 | if isinstance(key,str) and (key in qargs): 108 | return sorted(gpus,key=lambda d:d[key],reverse=reverse) 109 | if isinstance(key,type(lambda a:a)): 110 | return sorted(gpus,key=key,reverse=reverse) 111 | raise ValueError("The argument 'key' must be a function or a key in query args,please read the documention of nvidia-smi") 112 | 113 | def auto_choice(self,mode=0): 114 | ''' 115 | mode: 116 | 0:(default)sorted by free memory size 117 | return: 118 | a TF device object 119 | Auto choice the freest GPU device,not specified 120 | ones 121 | 自动选择最空闲GPU,返回索引 122 | ''' 123 | for old_infos,new_infos in zip(self.gpus,query_gpu(self.qargs)): 124 | old_infos.update(new_infos) 125 | unspecified_gpus=[gpu for gpu in self.gpus if not gpu['specified']] or self.gpus 126 | 127 | if mode==0: 128 | print('Choosing the GPU device has largest free memory...') 129 | chosen_gpu=self._sort_by_memory(unspecified_gpus,True)[0] 130 | elif mode==1: 131 | print('Choosing the GPU device has highest free memory rate...') 132 | chosen_gpu=self._sort_by_power(unspecified_gpus)[0] 133 | elif mode==2: 134 | print('Choosing the GPU device by power...') 135 | chosen_gpu=self._sort_by_power(unspecified_gpus)[0] 136 | 
else: 137 | print('Given an unaviliable mode,will be chosen by memory') 138 | chosen_gpu=self._sort_by_memory(unspecified_gpus)[0] 139 | chosen_gpu['specified']=True 140 | index=chosen_gpu['index'] 141 | print('Using GPU {i}:\n{info}'.format(i=index,info='\n'.join([str(k)+':'+str(v) for k,v in chosen_gpu.items()]))) 142 | return int(index) 143 | else: 144 | raise ImportError('GPU available check failed') 145 | --------------------------------------------------------------------------------