- # GPUtil - GPU utilization
- #
- # A Python module for programmatically getting the GPU utilization from NVIDIA GPUs using nvidia-smi
- #
- # Author: Anders Krogh Mortensen (anderskm)
- # Date: 16 January 2017
- # Web: https://github.com/anderskm/gputil
- #
- # LICENSE
- #
- # MIT License
- #
- # Copyright (c) 2017 anderskm
- #
- # Permission is hereby granted, free of charge, to any person obtaining a copy
- # of this software and associated documentation files (the "Software"), to deal
- # in the Software without restriction, including without limitation the rights
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- # copies of the Software, and to permit persons to whom the Software is
- # furnished to do so, subject to the following conditions:
- #
- # The above copyright notice and this permission notice shall be included in all
- # copies or substantial portions of the Software.
- #
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- # SOFTWARE.
-
- from subprocess import Popen, PIPE
- from distutils import spawn
- import os
- import math
- import random
- import time
- import sys
- import platform
- import subprocess
-
-
- __version__ = '1.4.0'
-
-
- class GPU:
- def __init__(self, ID, uuid, load, memoryTotal, memoryUsed, memoryFree, driver, gpu_name, serial, display_mode, display_active, temp_gpu):
- self.id = ID
- self.uuid = uuid
- self.load = load
- self.memoryUtil = float(memoryUsed)/float(memoryTotal)
- self.memoryTotal = memoryTotal
- self.memoryUsed = memoryUsed
- self.memoryFree = memoryFree
- self.driver = driver
- self.name = gpu_name
- self.serial = serial
- self.display_mode = display_mode
- self.display_active = display_active
- self.temperature = temp_gpu
-
- def __str__(self):
- return str(self.__dict__)
-
-
- class GPUProcess:
- def __init__(self, pid, processName, gpuId, gpuUuid, gpuName, usedMemory,
- uid, uname):
- self.pid = pid
- self.processName = processName
- self.gpuId = gpuId
- self.gpuUuid = gpuUuid
- self.gpuName = gpuName
- self.usedMemory = usedMemory
- self.uid = uid
- self.uname = uname
-
- def __str__(self):
- return str(self.__dict__)
-
- def safeFloatCast(strNumber):
- try:
- number = float(strNumber)
- except ValueError:
- number = float('nan')
- return number
-
- def getNvidiaSmiCmd():
- if platform.system() == "Windows":
- # If the platform is Windows and nvidia-smi
- # could not be found from the environment path,
- # try to find it from the system drive with the default installation path
- nvidia_smi = spawn.find_executable('nvidia-smi')
- if nvidia_smi is None:
- nvidia_smi = "%s\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe" % os.environ['systemdrive']
- else:
- nvidia_smi = "nvidia-smi"
- return nvidia_smi
-
-
- def getGPUs():
- # Get ID, processing and memory utilization for all GPUs
- nvidia_smi = getNvidiaSmiCmd()
- try:
- p = subprocess.run([
- nvidia_smi,
- "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
- "--format=csv,noheader,nounits"
- ], stdout=subprocess.PIPE, encoding='utf8')
- stdout, stderror = p.stdout, p.stderr
- except:
- return []
- output = stdout
- ## Parse output
- # Split on line breaks; subprocess.run was opened in text mode, so lines always end in '\n'
- lines = output.split('\n')
- numDevices = len(lines)-1
- GPUs = []
- for g in range(numDevices):
- line = lines[g]
- vals = line.split(', ')
- deviceIds = int(vals[0])
- uuid = vals[1]
- gpuUtil = safeFloatCast(vals[2]) / 100
- memTotal = safeFloatCast(vals[3])
- memUsed = safeFloatCast(vals[4])
- memFree = safeFloatCast(vals[5])
- driver = vals[6]
- gpu_name = vals[7]
- serial = vals[8]
- display_active = vals[9]
- display_mode = vals[10]
- temp_gpu = safeFloatCast(vals[11])
- GPUs.append(GPU(deviceIds, uuid, gpuUtil, memTotal, memUsed, memFree, driver, gpu_name, serial, display_mode, display_active, temp_gpu))
- return GPUs
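-
- # Example usage (a minimal sketch): print the load and memory utilization of
- # every detected GPU. If nvidia-smi is missing or fails, getGPUs() returns [].
- #
- #     for gpu in getGPUs():
- #         print(gpu.id, gpu.name,
- #               '{:.0%} load'.format(gpu.load),
- #               '{:.0%} memory'.format(gpu.memoryUtil))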
-
-
-
- def getGPUProcesses():
- """Get all gpu compute processes."""
-
- global gpuUuidToIdMap
- gpuUuidToIdMap = {}
- try:
- gpus = getGPUs()
- for gpu in gpus:
- gpuUuidToIdMap[gpu.uuid] = gpu.id
- del gpus
- except:
- pass
-
-
- nvidia_smi = getNvidiaSmiCmd()
- try:
- p = subprocess.run([
- nvidia_smi,
- "--query-compute-apps=pid,process_name,gpu_uuid,gpu_name,used_memory",
- "--format=csv,noheader,nounits"
- ], stdout=subprocess.PIPE, encoding='utf8')
- stdout, stderror = p.stdout, p.stderr
- except:
- return []
- output = stdout
- ## Parse output
- # Split on line break
- lines = output.split('\n')
- numProcesses = len(lines) - 1
- processes = []
- for g in range(numProcesses):
- line = lines[g]
- vals = line.split(', ')
- pid = int(vals[0])
- processName = vals[1]
- gpuUuid = vals[2]
- gpuName = vals[3]
- usedMemory = safeFloatCast(vals[4])
- # Use .get() so an unknown uuid maps to -1 instead of raising KeyError
- gpuId = gpuUuidToIdMap.get(gpuUuid, -1)
-
- # get uid and uname owner of the pid
- try:
- p = subprocess.run(['ps', f'-p{pid}', '-oruid=,ruser='],
- stdout=subprocess.PIPE, encoding='utf8')
- uid, uname = p.stdout.split()
- uid = int(uid)
- except:
- uid, uname = -1, ''
-
- processes.append(GPUProcess(pid, processName, gpuId, gpuUuid,
- gpuName, usedMemory, uid, uname))
- return processes
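-
- # Example usage (a minimal sketch): list every compute process currently seen by
- # nvidia-smi. Attribute names follow the GPUProcess class above.
- #
- #     for proc in getGPUProcesses():
- #         print('GPU %s: pid %d (%s) uses %.0f MB, owner %s' %
- #               (proc.gpuId, proc.pid, proc.processName, proc.usedMemory, proc.uname))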
-
-
- def getAvailable(order = 'first', limit=1, maxLoad=0.5, maxMemory=0.5, memoryFree=0, includeNan=False, excludeID=[], excludeUUID=[]):
- # order = first | last | random | load | memory
- # first --> select the GPU with the lowest ID (DEFAULT)
- # last --> select the GPU with the highest ID
- # random --> select a random available GPU
- # load --> select the GPU with the lowest load
- # memory --> select the GPU with the most memory available
- # limit = 1 (DEFAULT), 2, ..., Inf
- # Limit sets the upper limit for the number of GPUs to return. E.g. if limit = 2, but only one is available, only one is returned.
- # Get device IDs, load and memory usage
- GPUs = getGPUs()
- # Determine, which GPUs are available
- GPUavailability = getAvailability(GPUs, maxLoad=maxLoad, maxMemory=maxMemory, memoryFree=memoryFree, includeNan=includeNan, excludeID=excludeID, excludeUUID=excludeUUID)
- availableGPUindex = [idx for idx in range(0,len(GPUavailability)) if (GPUavailability[idx] == 1)]
- # Discard unavailable GPUs
- GPUs = [GPUs[g] for g in availableGPUindex]
- # Sort available GPUs according to the order argument
- if (order == 'first'):
- GPUs.sort(key=lambda x: float('inf') if math.isnan(x.id) else x.id, reverse=False)
- elif (order == 'last'):
- GPUs.sort(key=lambda x: float('-inf') if math.isnan(x.id) else x.id, reverse=True)
- elif (order == 'random'):
- GPUs = [GPUs[g] for g in random.sample(range(0,len(GPUs)),len(GPUs))]
- elif (order == 'load'):
- GPUs.sort(key=lambda x: float('inf') if math.isnan(x.load) else x.load, reverse=False)
- elif (order == 'memory'):
- GPUs.sort(key=lambda x: float('inf') if math.isnan(x.memoryUtil) else x.memoryUtil, reverse=False)
- # Extract the number of desired GPUs, but limited to the total number of available GPUs
- GPUs = GPUs[0:min(limit, len(GPUs))]
- # Extract the device IDs from the GPUs and return them
- deviceIds = [gpu.id for gpu in GPUs]
- return deviceIds
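-
- # Example usage (a minimal sketch): reserve up to two of the least-used GPUs by
- # exporting CUDA_VISIBLE_DEVICES. The 20% thresholds are illustrative, not
- # defaults of this module.
- #
- #     deviceIDs = getAvailable(order='memory', limit=2, maxLoad=0.2, maxMemory=0.2)
- #     os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(i) for i in deviceIDs)
-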
- def getAvailability(GPUs, maxLoad=0.5, maxMemory=0.5, memoryFree=0, includeNan=False, excludeID=[], excludeUUID=[]):
- # Determine, which GPUs are available
- GPUavailability = [1 if (gpu.memoryFree >= memoryFree)
- and (gpu.load < maxLoad or (includeNan and math.isnan(gpu.load)))
- and (gpu.memoryUtil < maxMemory or (includeNan and math.isnan(gpu.memoryUtil)))
- and (gpu.id not in excludeID) and (gpu.uuid not in excludeUUID)
- else 0
- for gpu in GPUs]
- return GPUavailability
- def getFirstAvailable(order = 'first', maxLoad=0.5, maxMemory=0.5, attempts=1, interval=900, verbose=False, includeNan=False, excludeID=[], excludeUUID=[]):
- for i in range(attempts):
- if (verbose):
- print('Attempting (' + str(i+1) + '/' + str(attempts) + ') to locate available GPU.')
- # Get first available GPU
- available = getAvailable(order=order, limit=1, maxLoad=maxLoad, maxMemory=maxMemory, includeNan=includeNan, excludeID=excludeID, excludeUUID=excludeUUID)
- # If an available GPU was found, break for loop.
- if (available):
- if (verbose):
- print('GPU ' + str(available) + ' located!')
- break
- # If this is not the last attempt, sleep for 'interval' seconds
- if (i != attempts-1):
- time.sleep(interval)
- # Check if a GPU was found, or if the attempts simply ran out. Raise an error if no GPU was found.
- if (not(available)):
- raise RuntimeError('Could not find an available GPU after ' + str(attempts) + ' attempts with ' + str(interval) + ' seconds interval.')
- # Return found GPU
- return available
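-
- # Example usage (a minimal sketch): wait for a mostly idle GPU, retrying up to
- # 10 times at 60-second intervals (both values are illustrative).
- #
- #     deviceID = getFirstAvailable(order='memory', maxLoad=0.1, maxMemory=0.1,
- #                                  attempts=10, interval=60, verbose=True)
-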
- def showUtilization(all=False, attrList=None, useOldCode=False):
- GPUs = getGPUs()
- if (all):
- if (useOldCode):
- print(' ID | Name | Serial | UUID || GPU util. | Memory util. || Memory total | Memory used | Memory free || Display mode | Display active |')
- print('------------------------------------------------------------------------------------------------------------------------------')
- for gpu in GPUs:
- print(' {0:2d} | {1:s} | {2:s} | {3:s} || {4:3.0f}% | {5:3.0f}% || {6:.0f}MB | {7:.0f}MB | {8:.0f}MB || {9:s} | {10:s}'.format(gpu.id,gpu.name,gpu.serial,gpu.uuid,gpu.load*100,gpu.memoryUtil*100,gpu.memoryTotal,gpu.memoryUsed,gpu.memoryFree,gpu.display_mode,gpu.display_active))
- else:
- attrList = [[{'attr':'id','name':'ID'},
- {'attr':'name','name':'Name'},
- {'attr':'serial','name':'Serial'},
- {'attr':'uuid','name':'UUID'}],
- [{'attr':'temperature','name':'GPU temp.','suffix':'C','transform': lambda x: x,'precision':0},
- {'attr':'load','name':'GPU util.','suffix':'%','transform': lambda x: x*100,'precision':0},
- {'attr':'memoryUtil','name':'Memory util.','suffix':'%','transform': lambda x: x*100,'precision':0}],
- [{'attr':'memoryTotal','name':'Memory total','suffix':'MB','precision':0},
- {'attr':'memoryUsed','name':'Memory used','suffix':'MB','precision':0},
- {'attr':'memoryFree','name':'Memory free','suffix':'MB','precision':0}],
- [{'attr':'display_mode','name':'Display mode'},
- {'attr':'display_active','name':'Display active'}]]
-
- else:
- if (useOldCode):
- print(' ID GPU MEM')
- print('--------------')
- for gpu in GPUs:
- print(' {0:2d} {1:3.0f}% {2:3.0f}%'.format(gpu.id, gpu.load*100, gpu.memoryUtil*100))
- else:
- attrList = [[{'attr':'id','name':'ID'},
- {'attr':'load','name':'GPU','suffix':'%','transform': lambda x: x*100,'precision':0},
- {'attr':'memoryUtil','name':'MEM','suffix':'%','transform': lambda x: x*100,'precision':0}],
- ]
-
- if (not useOldCode):
- if (attrList is not None):
- headerString = ''
- GPUstrings = ['']*len(GPUs)
- for attrGroup in attrList:
- for attrDict in attrGroup:
- headerString = headerString + '| ' + attrDict['name'] + ' '
- headerWidth = len(attrDict['name'])
- minWidth = len(attrDict['name'])
-
- attrPrecision = '.' + str(attrDict['precision']) if ('precision' in attrDict.keys()) else ''
- attrSuffix = str(attrDict['suffix']) if ('suffix' in attrDict.keys()) else ''
- attrTransform = attrDict['transform'] if ('transform' in attrDict.keys()) else lambda x : x
- for gpu in GPUs:
- attr = getattr(gpu,attrDict['attr'])
-
- attr = attrTransform(attr)
-
- # The module targets Python 3 (it uses f-strings), so only float, int and str need handling here
- if (isinstance(attr,float)):
- attrStr = ('{0:' + attrPrecision + 'f}').format(attr)
- elif (isinstance(attr,int)):
- attrStr = ('{0:d}').format(attr)
- elif (isinstance(attr,str)):
- attrStr = attr
- else:
- raise TypeError('Unhandled object type (' + str(type(attr)) + ') for attribute \'' + attrDict['name'] + '\'')
-
- attrStr += attrSuffix
-
- minWidth = max(minWidth,len(attrStr))
-
- headerString += ' '*max(0,minWidth-headerWidth)
-
- minWidthStr = str(minWidth - len(attrSuffix))
-
- for gpuIdx,gpu in enumerate(GPUs):
- attr = getattr(gpu,attrDict['attr'])
-
- attr = attrTransform(attr)
-
- if (isinstance(attr,float)):
- attrStr = ('{0:'+ minWidthStr + attrPrecision + 'f}').format(attr)
- elif (isinstance(attr,int)):
- attrStr = ('{0:' + minWidthStr + 'd}').format(attr)
- elif (isinstance(attr,str)):
- attrStr = ('{0:' + minWidthStr + 's}').format(attr)
- else:
- raise TypeError('Unhandled object type (' + str(type(attr)) + ') for attribute \'' + attrDict['name'] + '\'')
-
- attrStr += attrSuffix
-
- GPUstrings[gpuIdx] += '| ' + attrStr + ' '
-
- headerString = headerString + '|'
- for gpuIdx,gpu in enumerate(GPUs):
- GPUstrings[gpuIdx] += '|'
-
- headerSpacingString = '-' * len(headerString)
- print(headerString)
- print(headerSpacingString)
- for GPUstring in GPUstrings:
- print(GPUstring)
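-
- # Example usage (a minimal sketch): print a compact per-GPU summary, or the full
- # attribute table.
- #
- #     showUtilization()          # ID | GPU | MEM
- #     showUtilization(all=True)  # all attributes defined in attrList above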
-
-
- # Generate gpu uuid to id map
- gpuUuidToIdMap = {}
- try:
- gpus = getGPUs()
- for gpu in gpus:
- gpuUuidToIdMap[gpu.uuid] = gpu.id
- del gpus
- except:
- pass
- def getGPUInfos():
- ### Returns gpus: a list with one element (an object) per GPU.
- ### Each GPU object has the attributes 'id', 'load', 'memoryFree',
- ### 'memoryTotal', 'memoryUsed', 'memoryUtil', 'name', 'serial', 'temperature', 'uuid' and 'process'.
- ### 'process' is a list with one element (an object) per compute process on that GPU,
- ### with the attributes 'gpuId', 'gpuName', 'gpuUuid',
- ### 'gpuid', 'pid', 'processName', 'uid', 'uname' and 'usedMemory'.
- gpus = getGPUs()
- gpuUuidToIdMap = {}
- for gpu in gpus:
- gpuUuidToIdMap[gpu.uuid] = gpu.id
- gpu.process = []
- indexx = [x.id for x in gpus]
-
- process = getGPUProcesses()
- for pre in process:
- pre.gpuid = gpuUuidToIdMap[pre.gpuUuid]
- gpuId = indexx.index(pre.gpuid)
- gpus[gpuId].process.append(pre)
- return gpus
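-
- # Example usage (a minimal sketch): show which processes run on each GPU.
- #
- #     for gpu in getGPUInfos():
- #         print(gpu.id, gpu.name, [p.pid for p in gpu.process])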
-
- def get_available_gpu(gpuStatus):
- ## Check whether any GPU is idle (has no compute processes).
- ## Returns the id of the first idle GPU as a string, or None if all GPUs are busy.
- cuda = None
- for gpus in gpuStatus:
- if len(gpus.process) == 0:
- cuda = gpus.id
- return str(cuda)
- return cuda
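-
- # Example usage (a minimal sketch): pick a GPU with no running compute processes.
- # Note that get_available_gpu() returns the id as a string, or None when all are busy.
- #
- #     cuda = get_available_gpu(getGPUInfos())
- #     if cuda is not None:
- #         os.environ['CUDA_VISIBLE_DEVICES'] = cuda
-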
- def get_whether_gpuProcess():
- ## Return True if no GPU currently has a compute process running, otherwise False.
- gpuStatus = getGPUInfos()
- gpuProcess = True
- for gpus in gpuStatus:
- if len(gpus.process) != 0:
- gpuProcess = False
- return gpuProcess
-
- def get_offlineProcess_gpu(gpuStatus,pidInfos):
- gpu_onLine = []
- for gpu in gpuStatus:
- for gpuProcess in gpu.process:
- pid = gpuProcess.pid
- if pid in pidInfos.keys():
- pidType = pidInfos[pid]['type']
- if pidType == 'onLine':
- gpu_onLine.append(gpu)
- gpu_offLine = set(gpuStatus) - set(gpu_onLine)
- return list(gpu_offLine)
- def arrange_offlineProcess(gpuStatus, pidInfos, modelMemory=1500):
- cudaArrange = []
- gpu_offLine = get_offlineProcess_gpu(gpuStatus, pidInfos)
- for gpu in gpu_offLine:
- leftMemory = gpu.memoryTotal * 0.9 - gpu.memoryUsed
- modelCnt = int(leftMemory // modelMemory)
-
- cudaArrange.extend([gpu.id] * modelCnt)
- return cudaArrange
- def get_potential_gpu(gpuStatus, pidInfos):
- ### Every GPU is running something, but one card must be freed up for the 'online' task.
- ### Step 1: find the GPUs that carry no 'online' processes.
-
- gpu_offLine = get_offlineProcess_gpu(gpuStatus, pidInfos)
- if len(gpu_offLine) == 0:
- return False
-
- ### Step 2: of those, pick the GPU with the fewest offline processes.
- offLineCnt = [len(gpu.process) for gpu in gpu_offLine]
- minCntIndex = offLineCnt.index(min(offLineCnt))
-
- pids = [x.pid for x in gpu_offLine[minCntIndex].process]
- return {'cuda': gpu_offLine[minCntIndex].id, 'pids': pids}
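-
- # Example usage (a minimal sketch): pidInfos is assumed to map pid -> {'type': 'onLine' | 'offLine'},
- # which is all get_offlineProcess_gpu() inspects; build it however your scheduler tracks jobs.
- #
- #     status = getGPUInfos()
- #     pidInfos = {p.pid: {'type': 'offLine'} for g in status for p in g.process}
- #     print(arrange_offlineProcess(status, pidInfos, modelMemory=1500))
- #     print(get_potential_gpu(status, pidInfos))
-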
- if __name__ == '__main__':
- gpus = getGPUs()
- for gpu in gpus:
- gpuUuidToIdMap[gpu.uuid] = gpu.id
- print(gpu)
- print(gpuUuidToIdMap)
- pres = getGPUProcesses()
- print('GPU processes:', pres)
- for pre in pres:
- print('#' * 20)
- for key in ['gpuName', 'gpuUuid', 'pid', 'processName', 'uid', 'uname', 'usedMemory']:
- print(key, ' ', getattr(pre, key))
- print(' ')
-
-