502 lines
21 KiB
Python
502 lines
21 KiB
Python
|
|
#@@ -1,43 +1,43 @@
|
|||
|
|
# GPUtil - GPU utilization
|
|||
|
|
#
|
|||
|
|
# A Python module for programmatically getting the GPU utilization from NVIDIA GPUs using nvidia-smi
|
|||
|
|
#
|
|||
|
|
# Author: Anders Krogh Mortensen (anderskm)
|
|||
|
|
# Date: 16 January 2017
|
|||
|
|
# Web: https://github.com/anderskm/gputil
|
|||
|
|
#
|
|||
|
|
# LICENSE
|
|||
|
|
#
|
|||
|
|
# MIT License
|
|||
|
|
#
|
|||
|
|
# Copyright (c) 2017 anderskm
|
|||
|
|
#
|
|||
|
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|||
|
|
# of this software and associated documentation files (the "Software"), to deal
|
|||
|
|
# in the Software without restriction, including without limitation the rights
|
|||
|
|
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|||
|
|
# copies of the Software, and to permit persons to whom the Software is
|
|||
|
|
# furnished to do so, subject to the following conditions:
|
|||
|
|
#
|
|||
|
|
# The above copyright notice and this permission notice shall be included in all
|
|||
|
|
# copies or substantial portions of the Software.
|
|||
|
|
#
|
|||
|
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|||
|
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|||
|
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|||
|
|
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|||
|
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|||
|
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|||
|
|
# SOFTWARE.
|
|||
|
|
|
|||
|
|
from subprocess import Popen, PIPE
|
|||
|
|
from distutils import spawn
|
|||
|
|
import os
|
|||
|
|
import math
|
|||
|
|
import random
|
|||
|
|
import time
|
|||
|
|
import sys
|
|||
|
|
import platform
|
|||
|
|
import subprocess
|
|||
|
|
import numpy as np
|
|||
|
|
|
|||
|
|
|
|||
|
|
__version__ = '1.4.0'
|
|||
|
|
class GPU:
    """Snapshot of a single GPU built from one row of ``nvidia-smi`` output.

    Every constructor argument is stored as an attribute (``ID`` as ``id``,
    ``gpu_name`` as ``name``, ``temp_gpu`` as ``temperature``).  In addition
    ``memoryUtil`` holds ``memoryUsed / memoryTotal``.
    """

    def __init__(self, ID, uuid, load, memoryTotal, memoryUsed, memoryFree, driver, gpu_name, serial, display_mode, display_active, temp_gpu):
        self.id = ID
        self.uuid = uuid
        self.load = load
        # A total of 0 used to raise ZeroDivisionError; report "unknown"
        # (NaN) instead, which is what unparsable readings already produce.
        try:
            self.memoryUtil = float(memoryUsed) / float(memoryTotal)
        except ZeroDivisionError:
            self.memoryUtil = float('nan')
        self.memoryTotal = memoryTotal
        self.memoryUsed = memoryUsed
        self.memoryFree = memoryFree
        self.driver = driver
        self.name = gpu_name
        self.serial = serial
        self.display_mode = display_mode
        self.display_active = display_active
        self.temperature = temp_gpu

    def __str__(self):
        """Return the attribute dictionary rendered as a string."""
        return str(self.__dict__)
|
|||
|
|
|
|||
|
|
|
|||
|
|
class GPUProcess:
    """Plain record describing one compute process running on a GPU."""

    def __init__(self, pid, processName, gpuId, gpuUuid, gpuName, usedMemory,
                 uid, uname):
        # The process itself.
        self.pid, self.processName = pid, processName
        # The GPU it runs on.
        self.gpuId, self.gpuUuid, self.gpuName = gpuId, gpuUuid, gpuName
        self.usedMemory = usedMemory
        # Owner of the process.
        self.uid, self.uname = uid, uname

    def __str__(self):
        """Return the attribute dictionary rendered as a string."""
        return str(vars(self))
|
|||
|
|
|
|||
|
|
def safeFloatCast(strNumber):
    """Convert ``strNumber`` to ``float``, returning NaN when it cannot be converted.

    Args:
        strNumber: Value to convert, normally a string field.

    Returns:
        float: The parsed number, or ``float('nan')`` if ``float()`` rejects
        the value (unparsable text, ``None``, or another unsupported type).
    """
    try:
        return float(strNumber)
    except (ValueError, TypeError):
        # TypeError covers None and other non-numeric objects, which
        # previously escaped as an exception instead of becoming NaN.
        return float('nan')
|
|||
|
|
|
|||
|
|
#def getGPUs():
|
|||
|
|
def getNvidiaSmiCmd():
    """Return the command used to invoke ``nvidia-smi`` on this platform.

    Returns:
        str: ``"nvidia-smi"`` on non-Windows platforms.  On Windows, the
        executable found on the environment path if there is one, otherwise
        the default installation path on the system drive.
    """
    if platform.system() != "Windows":
        return "nvidia-smi"

    # Function-scope import keeps this block self-contained.
    import shutil

    # If the platform is Windows, prefer nvidia-smi from the environment path.
    found = shutil.which("nvidia-smi")
    if found is not None:
        return found

    # It could not be found on the path: fall back to the default
    # installation path on the system drive ("C:" if the variable is unset).
    systemDrive = os.environ.get('systemdrive', 'C:')
    return "%s\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe" % systemDrive
|
|||
|
|
|
|||
|
|
|
|||
|
|
def getGPUs():
    """Query ``nvidia-smi`` once and return one ``GPU`` object per device row.

    Returns:
        list: ``GPU`` objects in the order nvidia-smi printed them.  An empty
        list is returned when nvidia-smi cannot be executed.  Output rows
        that do not have the twelve expected fields (for example an error
        message printed by nvidia-smi) are skipped.
    """
    nvidia_smi = getNvidiaSmiCmd()
    # Field order here fixes the meaning of each index in ``vals`` below.
    query = ("--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,"
             "memory.free,driver_version,name,gpu_serial,display_active,"
             "display_mode,temperature.gpu")
    try:
        p = subprocess.run(
            [nvidia_smi, query, "--format=csv,noheader,nounits"],
            stdout=subprocess.PIPE, encoding='utf8')
    except Exception:
        # Best effort: a missing or unusable nvidia-smi means "no GPUs".
        return []

    GPUs = []
    # splitlines() copes with any newline convention and with the trailing
    # newline, unlike splitting on os.linesep.
    for line in p.stdout.splitlines():
        vals = line.split(', ')
        if len(vals) < 12:
            continue
        try:
            deviceIds = int(vals[0])
        except ValueError:
            continue
        uuid = vals[1]
        # Utilization is printed as a percentage; store it as a fraction.
        gpuUtil = safeFloatCast(vals[2]) / 100
        memTotal = safeFloatCast(vals[3])
        memUsed = safeFloatCast(vals[4])
        memFree = safeFloatCast(vals[5])
        driver = vals[6]
        gpu_name = vals[7]
        serial = vals[8]
        display_active = vals[9]
        display_mode = vals[10]
        temp_gpu = safeFloatCast(vals[11])
        GPUs.append(GPU(deviceIds, uuid, gpuUtil, memTotal, memUsed, memFree, driver, gpu_name, serial, display_mode, display_active, temp_gpu))
    return GPUs
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
def getGPUProcesses():
    """Get all gpu compute processes.

    Rebuilds the module-level ``gpuUuidToIdMap`` from ``getGPUs()``, asks
    nvidia-smi for the compute applications, and looks up the owner of each
    process with ``ps``.

    Returns:
        list: ``GPUProcess`` objects.  ``gpuId`` is -1 when the process's GPU
        uuid is not in the map; ``uid``/``uname`` are ``-1``/``''`` when the
        owner lookup fails.  An empty list is returned when nvidia-smi cannot
        be executed.  Rows without the five expected fields are skipped.
    """
    global gpuUuidToIdMap
    gpuUuidToIdMap = {}
    try:
        for gpu in getGPUs():
            gpuUuidToIdMap[gpu.uuid] = gpu.id
    except Exception:
        # Best effort: without the map every process simply gets gpuId -1.
        pass

    nvidia_smi = getNvidiaSmiCmd()
    try:
        p = subprocess.run([
            nvidia_smi,
            "--query-compute-apps=pid,process_name,gpu_uuid,gpu_name,used_memory",
            "--format=csv,noheader,nounits"
        ], stdout=subprocess.PIPE, encoding='utf8')
    except Exception:
        return []

    processes = []
    # splitlines() copes with any newline convention and the trailing newline.
    for line in p.stdout.splitlines():
        vals = line.split(', ')
        if len(vals) < 5:
            continue
        try:
            pid = int(vals[0])
        except ValueError:
            continue
        processName = vals[1]
        gpuUuid = vals[2]
        gpuName = vals[3]
        usedMemory = safeFloatCast(vals[4])
        # Plain indexing raised KeyError here, so the intended -1 fallback
        # for unknown GPUs could never be reached.
        gpuId = gpuUuidToIdMap.get(gpuUuid, -1)

        # get uid and uname owner of the pid
        try:
            owner = subprocess.run(['ps', f'-p{pid}', '-oruid=,ruser='],
                                   stdout=subprocess.PIPE, encoding='utf8')
            uid, uname = owner.stdout.split()
            uid = int(uid)
        except Exception:
            # ps unavailable, or the process has already exited.
            uid, uname = -1, ''

        processes.append(GPUProcess(pid, processName, gpuId, gpuUuid,
                                    gpuName, usedMemory, uid, uname))
    return processes
|
|||
|
|
|
|||
|
|
|
|||
|
|
def getAvailable(order = 'first', limit=1, maxLoad=0.5, maxMemory=0.5, memoryFree=0, includeNan=False, excludeID=[], excludeUUID=[]):
    """Return the ids of up to ``limit`` available GPUs.

    Args:
        order: How to rank the available GPUs before applying ``limit``:
            ``'first'`` (default) lowest id first, ``'last'`` highest id
            first, ``'random'`` random order, ``'load'`` lowest load first,
            ``'memory'`` lowest memory utilisation first.  Any other value
            leaves the order unchanged.
        limit: Upper bound on the number of ids returned; fewer are returned
            when fewer GPUs are available.
        maxLoad, maxMemory, memoryFree, includeNan, excludeID, excludeUUID:
            Forwarded unchanged to ``getAvailability``.

    Returns:
        list: Device ids of the selected GPUs.
    """
    # Query the devices and flag which of them are available.
    everyGPU = getGPUs()
    flags = getAvailability(everyGPU, maxLoad=maxLoad, maxMemory=maxMemory, memoryFree=memoryFree, includeNan=includeNan, excludeID=excludeID, excludeUUID=excludeUUID)

    # Keep only the GPUs flagged as available.
    candidates = [gpu for gpu, flag in zip(everyGPU, flags) if flag == 1]

    def nanAs(value, replacement):
        # NaN readings are ranked as ``replacement`` so sorting stays total.
        return replacement if math.isnan(value) else value

    if order == 'first':
        candidates.sort(key=lambda gpu: nanAs(gpu.id, float('inf')))
    elif order == 'last':
        candidates.sort(key=lambda gpu: nanAs(gpu.id, float('-inf')), reverse=True)
    elif order == 'random':
        shuffled = random.sample(range(len(candidates)), len(candidates))
        candidates = [candidates[position] for position in shuffled]
    elif order == 'load':
        candidates.sort(key=lambda gpu: nanAs(gpu.load, float('inf')))
    elif order == 'memory':
        candidates.sort(key=lambda gpu: nanAs(gpu.memoryUtil, float('inf')))

    # Never ask for more GPUs than are available.
    selected = candidates[:min(limit, len(candidates))]
    return [gpu.id for gpu in selected]
|
|||
|
|
#def getAvailability(GPUs, maxLoad = 0.5, maxMemory = 0.5, includeNan = False):
|
|||
|
|
# # Determine, which GPUs are available
|
|||
|
|
# GPUavailability = np.zeros(len(GPUs))
|
|||
|
|
# for i in range(len(GPUs)):
|
|||
|
|
# if (GPUs[i].load < maxLoad or (includeNan and np.isnan(GPUs[i].load))) and (GPUs[i].memoryUtil < maxMemory or (includeNan and np.isnan(GPUs[i].memoryUtil))):
|
|||
|
|
# GPUavailability[i] = 1
|
|||
|
|
def getAvailability(GPUs, maxLoad=0.5, maxMemory=0.5, memoryFree=0, includeNan=False, excludeID=[], excludeUUID=[]):
    """Flag each GPU as available (1) or unavailable (0).

    A GPU is available when all of the following hold:
      * ``memoryFree`` is at least the requested ``memoryFree``;
      * ``load`` is below ``maxLoad`` (or is NaN and ``includeNan`` is set);
      * ``memoryUtil`` is below ``maxMemory`` (or is NaN and ``includeNan``
        is set);
      * its id is not in ``excludeID`` and its uuid is not in ``excludeUUID``.

    Returns:
        list: One 0/1 entry per element of ``GPUs``, in the same order.
    """
    def isAvailable(gpu):
        if not (gpu.memoryFree >= memoryFree):
            return False
        if not (gpu.load < maxLoad or (includeNan and math.isnan(gpu.load))):
            return False
        if not (gpu.memoryUtil < maxMemory or (includeNan and math.isnan(gpu.memoryUtil))):
            return False
        if gpu.id in excludeID:
            return False
        return gpu.uuid not in excludeUUID

    availability = []
    for gpu in GPUs:
        availability.append(1 if isAvailable(gpu) else 0)
    return availability
|
|||
|
|
def getFirstAvailable(order = 'first', maxLoad=0.5, maxMemory=0.5, attempts=1, interval=900, verbose=False, includeNan=False, excludeID=[], excludeUUID=[]):
    """Return the first available GPU, retrying until one is found.

    Args:
        order, maxLoad, maxMemory, includeNan, excludeID, excludeUUID:
            Forwarded to ``getAvailable`` (always with ``limit=1``).
        attempts: Number of times to look for a GPU.
        interval: Seconds to sleep between two attempts.
        verbose: Print progress messages when true.

    Returns:
        list: The (single-element) id list returned by ``getAvailable``.

    Raises:
        RuntimeError: If no GPU was found within ``attempts`` attempts,
            including when ``attempts`` is zero or negative.
    """
    # Bound up front so that attempts <= 0 reaches the RuntimeError below
    # instead of failing on an unbound local.
    available = []
    for i in range(attempts):
        if (verbose):
            print('Attempting (' + str(i+1) + '/' + str(attempts) + ') to locate available GPU.')
        # Get first available GPU
        available = getAvailable(order=order, limit=1, maxLoad=maxLoad, maxMemory=maxMemory, includeNan=includeNan, excludeID=excludeID, excludeUUID=excludeUUID)
        # If an available GPU was found, break for loop.
        if (available):
            if (verbose):
                print('GPU ' + str(available) + ' located!')
            break
        # If this is not the last attempt, sleep for 'interval' seconds
        if (i != attempts-1):
            time.sleep(interval)
    # Check if a GPU was found, or if the attempts simply ran out.
    if (not(available)):
        raise RuntimeError('Could not find an available GPU after ' + str(attempts) + ' attempts with ' + str(interval) + ' seconds interval.')
    # Return found GPU
    return available
|
|||
|
|
def _formatAttr(attr, attrName, width='', precision=''):
    """Render one attribute value for the utilization table.

    ``width`` and ``precision`` are format-spec fragments (e.g. ``'5'`` and
    ``'.0'``); precision only applies to floats.  Raises ``TypeError`` for
    values that are not float, int or str.
    """
    if isinstance(attr, float):
        return ('{0:' + width + precision + 'f}').format(attr)
    if isinstance(attr, int):
        return ('{0:' + width + 'd}').format(attr)
    if isinstance(attr, str):
        return ('{0:' + width + 's}').format(attr)
    raise TypeError('Unhandled object type (' + str(type(attr)) + ') for attribute \'' + attrName + '\'')


def showUtilization(all=False, attrList=None, useOldCode=False):
    """Print a table describing every GPU returned by ``getGPUs()``.

    Args:
        all: When true, print the detailed column set; otherwise only id,
            load and memory utilisation.
        attrList: Optional custom layout: a list of column groups, each a
            list of dicts with keys ``'attr'`` (GPU attribute name) and
            ``'name'`` (header text) and optional ``'suffix'``,
            ``'transform'`` (callable applied to the value) and
            ``'precision'``.  Used when given; previously it was always
            overwritten by the built-in layouts.  Ignored with
            ``useOldCode``.
        useOldCode: Use the fixed-format legacy printing instead of the
            column-aligned table.

    Raises:
        TypeError: If a (transformed) attribute is not a float, int or str.
    """
    GPUs = getGPUs()

    if (useOldCode):
        if (all):
            print(' ID | Name | Serial | UUID || GPU util. | Memory util. || Memory total | Memory used | Memory free || Display mode | Display active |')
            print('------------------------------------------------------------------------------------------------------------------------------')
            for gpu in GPUs:
                print(' {0:2d} | {1:s} | {2:s} | {3:s} || {4:3.0f}% | {5:3.0f}% || {6:.0f}MB | {7:.0f}MB | {8:.0f}MB || {9:s} | {10:s}'.format(gpu.id,gpu.name,gpu.serial,gpu.uuid,gpu.load*100,gpu.memoryUtil*100,gpu.memoryTotal,gpu.memoryUsed,gpu.memoryFree,gpu.display_mode,gpu.display_active))
        else:
            print(' ID GPU MEM')
            print('--------------')
            for gpu in GPUs:
                print(' {0:2d} {1:3.0f}% {2:3.0f}%'.format(gpu.id, gpu.load*100, gpu.memoryUtil*100))
        return

    if (attrList is None):
        if (all):
            attrList = [[{'attr':'id','name':'ID'},
                         {'attr':'name','name':'Name'},
                         {'attr':'serial','name':'Serial'},
                         {'attr':'uuid','name':'UUID'}],
                        [{'attr':'temperature','name':'GPU temp.','suffix':'C','transform': lambda x: x,'precision':0},
                         {'attr':'load','name':'GPU util.','suffix':'%','transform': lambda x: x*100,'precision':0},
                         {'attr':'memoryUtil','name':'Memory util.','suffix':'%','transform': lambda x: x*100,'precision':0}],
                        [{'attr':'memoryTotal','name':'Memory total','suffix':'MB','precision':0},
                         {'attr':'memoryUsed','name':'Memory used','suffix':'MB','precision':0},
                         {'attr':'memoryFree','name':'Memory free','suffix':'MB','precision':0}],
                        [{'attr':'display_mode','name':'Display mode'},
                         {'attr':'display_active','name':'Display active'}]]
        else:
            attrList = [[{'attr':'id','name':'ID'},
                         {'attr':'load','name':'GPU','suffix':'%','transform': lambda x: x*100,'precision':0},
                         {'attr':'memoryUtil','name':'MEM','suffix':'%','transform': lambda x: x*100,'precision':0}],
                        ]

    headerString = ''
    GPUstrings = ['']*len(GPUs)
    for attrGroup in attrList:
        for attrDict in attrGroup:
            attrName = attrDict['name']
            headerString += '| ' + attrName + ' '

            attrPrecision = '.' + str(attrDict['precision']) if ('precision' in attrDict) else ''
            attrSuffix = str(attrDict['suffix']) if ('suffix' in attrDict) else ''
            attrTransform = attrDict['transform'] if ('transform' in attrDict) else lambda x : x

            values = [attrTransform(getattr(gpu, attrDict['attr'])) for gpu in GPUs]

            # First pass: the column is as wide as its widest cell or header.
            minWidth = len(attrName)
            for value in values:
                cell = _formatAttr(value, attrName, precision=attrPrecision) + attrSuffix
                minWidth = max(minWidth, len(cell))

            # Pad the header out to the column width.
            headerString += ' '*max(0, minWidth - len(attrName))

            # Second pass: pad the value itself; the suffix takes the rest.
            minWidthStr = str(minWidth - len(attrSuffix))
            for gpuIdx, value in enumerate(values):
                cell = _formatAttr(value, attrName, minWidthStr, attrPrecision) + attrSuffix
                GPUstrings[gpuIdx] += '| ' + cell + ' '

        # Close the group; adjacent groups end up separated by '||'.
        headerString = headerString + '|'
        GPUstrings = [GPUstring + '|' for GPUstring in GPUstrings]

    headerSpacingString = '-' * len(headerString)
    print(headerString)
    print(headerSpacingString)
    for GPUstring in GPUstrings:
        print(GPUstring)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Generate gpu uuid to id map.
# This runs at import time, so it must never make the import fail: any
# ordinary error leaves the map empty or partially filled.  Only ``Exception``
# is caught so that KeyboardInterrupt / SystemExit are not swallowed.
gpuUuidToIdMap = {}
try:
    for gpu in getGPUs():
        gpuUuidToIdMap[gpu.uuid] = gpu.id
except Exception:
    pass
|
|||
|
|
def getGPUInfos():
    """Return every GPU together with the compute processes running on it.

    Returns:
        list: The GPU objects from ``getGPUs()``, one element per GPU.  Each
        has its usual attributes ('id', 'load', 'memoryFree', 'memoryTotal',
        'memoryUsed', 'memoryUtil', 'name', 'serial', 'temperature', 'uuid')
        plus ``process``: a list with one object per compute process on that
        GPU.  Each process object has 'gpuId', 'gpuName', 'gpuUuid', 'gpuid',
        'pid', 'processName', 'uid', 'uname' and 'usedMemory'.

        Processes whose GPU uuid does not belong to any listed GPU are left
        out instead of raising.
    """
    gpus = getGPUs()
    # uuid -> GPU object, so each process is attached in O(1).
    gpuByUuid = {}
    for gpu in gpus:
        gpu.process = []
        gpuByUuid[gpu.uuid] = gpu

    for pre in getGPUProcesses():
        owner = gpuByUuid.get(pre.gpuUuid)
        if owner is None:
            continue
        pre.gpuid = owner.id
        owner.process.append(pre)
    return gpus
|
|||
|
|
|
|||
|
|
def get_available_gpu(gpuStatus):
    """Find an idle GPU.

    Returns the id, converted to ``str``, of the first GPU in ``gpuStatus``
    whose ``process`` list is empty, or ``None`` when every GPU has at least
    one process.
    """
    idleGpus = (gpu for gpu in gpuStatus if len(gpu.process) == 0)
    firstIdle = next(idleGpus, None)
    if firstIdle is None:
        return None
    return str(firstIdle.id)
|
|||
|
|
def get_whether_gpuProcess():
    """Report whether all GPUs are free of compute processes.

    Returns ``True`` when no GPU returned by ``getGPUInfos()`` has any
    process attached, and ``False`` as soon as at least one GPU has one.
    """
    occupied = [gpu for gpu in getGPUInfos() if len(gpu.process) != 0]
    return not occupied
|
|||
|
|
|
|||
|
|
def get_offlineProcess_gpu(gpuStatus,pidInfos):
    """Return the GPUs that run no 'onLine' process.

    Args:
        gpuStatus: Iterable of GPU objects, each with a ``process`` list
            whose items have a ``pid`` attribute.
        pidInfos: Mapping ``pid -> info`` where ``info['type']`` is compared
            against ``'onLine'``.  Pids missing from the mapping do not count
            as online.

    Returns:
        list: The GPUs from ``gpuStatus`` with no process of type 'onLine',
        in their original order and without duplicates.  (The former set
        difference returned them in arbitrary order and required hashable
        GPU objects.)
    """
    # Identity of every GPU hosting at least one online process.
    onlineKeys = set()
    for gpu in gpuStatus:
        for gpuProcess in gpu.process:
            pid = gpuProcess.pid
            if pid in pidInfos and pidInfos[pid]['type'] == 'onLine':
                onlineKeys.add(id(gpu))
                break

    gpu_offLine = []
    seenKeys = set()
    for gpu in gpuStatus:
        key = id(gpu)
        if key in onlineKeys or key in seenKeys:
            continue
        seenKeys.add(key)
        gpu_offLine.append(gpu)
    return gpu_offLine
|
|||
|
|
def arrange_offlineProcess(gpuStatus,pidInfos,modelMemory=1500,memoryFraction=0.9):
    """List GPU ids, one entry per model that still fits on each offline GPU.

    For every GPU returned by ``get_offlineProcess_gpu`` the usable memory is
    ``memoryTotal * memoryFraction - memoryUsed``; the GPU id is repeated once
    for every whole ``modelMemory`` that fits into it.

    Args:
        gpuStatus: GPU objects, passed on to ``get_offlineProcess_gpu``.
        pidInfos: Process information, passed on to ``get_offlineProcess_gpu``.
        modelMemory: Memory needed by one model, in the same unit as the
            GPU memory attributes.
        memoryFraction: Share of the total memory that may be occupied
            (default 0.9, the value that used to be hard-coded).

    Returns:
        list: GPU ids, each repeated as many times as models fit.  GPUs with
        no room, or whose memory readings are NaN, contribute nothing.
    """
    cudaArrange = []
    for gpu in get_offlineProcess_gpu(gpuStatus,pidInfos):
        leftMemory = gpu.memoryTotal*memoryFraction - gpu.memoryUsed
        # Unparsable memory readings are NaN; int(nan) would raise.
        if math.isnan(leftMemory):
            continue
        modelCnt = int(leftMemory // modelMemory)
        # A negative count multiplies to an empty list, adding nothing.
        cudaArrange.extend([gpu.id] * modelCnt)
    return cudaArrange
|
|||
|
|
def get_potential_gpu(gpuStatus,pidInfos):
    """Pick the GPU that is cheapest to clear for an online task.

    Intended for the case where every GPU already has work on it and one
    card must be freed for an "online task".

    Returns:
        ``False`` when every GPU hosts an online process.  Otherwise a dict
        ``{'cuda': <gpu id>, 'pids': [<pid>, ...]}`` for the GPU without
        online processes that has the fewest processes (the first such GPU
        on ties), with the pids of all processes on it.
    """
    # Step 1: which cards carry no online task at all?
    candidates = get_offlineProcess_gpu(gpuStatus,pidInfos)
    if not candidates:
        return False

    # Step 2: among those, take the card with the fewest (offline) processes.
    target = min(candidates, key=lambda gpu: len(gpu.process))
    return {'cuda': target.id, 'pids': [proc.pid for proc in target.process]}
|
|||
|
|
if __name__=='__main__':
    # Manual smoke test: dump every GPU, the uuid -> id map, and every
    # compute process with its attributes.
    gpus = getGPUs()
    for gpu in gpus:
        gpuUuidToIdMap[gpu.uuid] = gpu.id
        print(gpu)
    print(gpuUuidToIdMap)

    pres = getGPUProcesses()
    print('###line404:',pres)
    shownAttrs = ['gpuName','gpuUuid','pid','processName','uid','uname','usedMemory' ]
    for pre in pres:
        print('#'*20)
        for ken in shownAttrs:
            print(ken,' ',getattr(pre, ken))
        print(' ')
|
|||
|
|
|
|||
|
|
|