# wangjin0928 / kafka_yolov5

#@@ -1,43 +1,43 @@
# GPUtil - GPU utilization
#
# A Python module for programmatically getting the GPU utilization from NVIDIA GPUs using nvidia-smi
#
# Author: Anders Krogh Mortensen (anderskm)
# Date:   16 January 2017
# Web:    https://github.com/anderskm/gputil
#
# LICENSE
#
# MIT License
#
# Copyright (c) 2017 anderskm
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from subprocess import Popen, PIPE
from distutils import spawn
import os
import math
import random
import time
import sys
import platform
import subprocess
import numpy as np


__version__ = '1.4.0'
class GPU:
    """Snapshot of one GPU as parsed from a single nvidia-smi query row.

    All constructor arguments are stored unchanged on the instance
    (``ID`` -> ``id``, ``gpu_name`` -> ``name``, ``temp_gpu`` ->
    ``temperature``).  ``memoryUtil`` is derived as
    ``memoryUsed / memoryTotal``.
    """

    def __init__(self, ID, uuid, load, memoryTotal, memoryUsed, memoryFree, driver, gpu_name, serial, display_mode, display_active, temp_gpu):
        self.id = ID
        self.uuid = uuid
        self.load = load
        # A total of 0 makes the ratio undefined; report NaN (the same
        # "unknown" marker safeFloatCast produces) instead of raising.
        try:
            self.memoryUtil = float(memoryUsed) / float(memoryTotal)
        except ZeroDivisionError:
            self.memoryUtil = float('nan')
        self.memoryTotal = memoryTotal
        self.memoryUsed = memoryUsed
        self.memoryFree = memoryFree
        self.driver = driver
        self.name = gpu_name
        self.serial = serial
        self.display_mode = display_mode
        self.display_active = display_active
        self.temperature = temp_gpu

    def __str__(self):
        """Return the instance attributes rendered as a dict string."""
        return str(self.__dict__)


class GPUProcess:
    """Record describing one compute process running on a GPU.

    Every constructor argument is stored unchanged under an attribute of
    the same name.
    """

    def __init__(self, pid, processName, gpuId, gpuUuid, gpuName, usedMemory,
                 uid, uname):
        # The process itself.
        self.pid, self.processName = pid, processName
        # The GPU it runs on.
        self.gpuId, self.gpuUuid, self.gpuName = gpuId, gpuUuid, gpuName
        # Memory used on that GPU.
        self.usedMemory = usedMemory
        # Owner of the process.
        self.uid, self.uname = uid, uname

    def __str__(self):
        """Return the instance attributes rendered as a dict string."""
        return str(vars(self))

def safeFloatCast(strNumber):
    """Convert ``strNumber`` to float, returning NaN when it is not numeric.

    Both unparsable strings (ValueError) and non-string, non-numeric
    inputs such as ``None`` (TypeError) yield ``float('nan')`` so the
    caller never has to handle a conversion error.
    """
    try:
        return float(strNumber)
    except (ValueError, TypeError):
        return float('nan')

#def getGPUs():
def getNvidiaSmiCmd():
    """Return the command (name or full path) used to invoke nvidia-smi.

    On Windows the executable is first looked up on the environment PATH;
    only if it cannot be found there is the default installation path on
    the system drive used.  On every other platform the bare command name
    is returned.

    Raises:
        KeyError: on Windows, when the PATH lookup fails and the
            ``systemdrive`` environment variable is not set.
    """
    if platform.system() == "Windows":
        # Imported locally so this function does not depend on the
        # file-level import block.
        import shutil

        nvidia_smi = shutil.which('nvidia-smi')
        if nvidia_smi is None:
            # Not on PATH: fall back to the default installation path.
            nvidia_smi = "%s\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe" % os.environ['systemdrive']
    else:
        nvidia_smi = "nvidia-smi"
    return nvidia_smi


def getGPUs():
    """Query nvidia-smi once and return one ``GPU`` object per reported device.

    Returns:
        A list of ``GPU`` instances in the order nvidia-smi printed them,
        or an empty list when nvidia-smi could not be executed.

    Raises:
        ValueError / IndexError: if an output line does not have the
            expected comma-separated layout (e.g. the index field is not
            an integer or fields are missing).
    """
    nvidia_smi = getNvidiaSmiCmd()
    try:
        p = subprocess.run([
            nvidia_smi,
            "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
            "--format=csv,noheader,nounits"
        ], stdout=subprocess.PIPE, encoding='utf8')
    except (OSError, ValueError, subprocess.SubprocessError):
        # nvidia-smi missing / not executable / undecodable output:
        # behave as if no GPU is present.
        return []

    GPUs = []
    # The output is decoded text, so line endings are already normalised;
    # splitlines() avoids depending on os.linesep and on a trailing newline.
    for line in p.stdout.splitlines():
        if not line.strip():
            continue
        vals = line.split(', ')
        deviceIds = int(vals[0])
        uuid = vals[1]
        # utilization.gpu is queried as a percentage; store it as a fraction.
        gpuUtil = safeFloatCast(vals[2]) / 100
        memTotal = safeFloatCast(vals[3])
        memUsed = safeFloatCast(vals[4])
        memFree = safeFloatCast(vals[5])
        driver = vals[6]
        gpu_name = vals[7]
        serial = vals[8]
        # Query order is display_active, display_mode; the GPU constructor
        # takes them in the opposite order.
        display_active = vals[9]
        display_mode = vals[10]
        temp_gpu = safeFloatCast(vals[11])
        GPUs.append(GPU(deviceIds, uuid, gpuUtil, memTotal, memUsed, memFree, driver, gpu_name, serial, display_mode, display_active, temp_gpu))
    return GPUs


def getGPUProcesses():
    """Get all gpu compute processes.

    Rebuilds the module-level ``gpuUuidToIdMap`` from a fresh ``getGPUs()``
    call, then queries nvidia-smi for the running compute apps.

    Returns:
        A list of ``GPUProcess`` objects, or an empty list when nvidia-smi
        could not be executed.  ``gpuId`` is -1 when the process' GPU uuid
        is not in the map; ``uid``/``uname`` are ``-1``/``''`` when the
        owner could not be determined via ``ps``.
    """
    global gpuUuidToIdMap
    gpuUuidToIdMap = {}
    try:
        for gpu in getGPUs():
            gpuUuidToIdMap[gpu.uuid] = gpu.id
    except Exception:
        # Best effort: without the map every process simply gets gpuId -1.
        pass

    nvidia_smi = getNvidiaSmiCmd()
    try:
        p = subprocess.run([
            nvidia_smi,
            "--query-compute-apps=pid,process_name,gpu_uuid,gpu_name,used_memory",
            "--format=csv,noheader,nounits"
        ], stdout=subprocess.PIPE, encoding='utf8')
    except (OSError, ValueError, subprocess.SubprocessError):
        return []

    processes = []
    # Decoded text output: splitlines() is independent of os.linesep and of
    # whether the output ends with a newline.
    for line in p.stdout.splitlines():
        if not line.strip():
            continue
        vals = line.split(', ')
        pid = int(vals[0])
        processName = vals[1]
        gpuUuid = vals[2]
        gpuName = vals[3]
        usedMemory = safeFloatCast(vals[4])
        # Unknown uuid -> -1 (a plain subscript would raise KeyError and
        # never reach the fallback).
        gpuId = gpuUuidToIdMap.get(gpuUuid, -1)

        # get uid and uname owner of the pid
        try:
            ps = subprocess.run(['ps', f'-p{pid}', '-oruid=,ruser='],
                                stdout=subprocess.PIPE, encoding='utf8')
            uid, uname = ps.stdout.split()
            uid = int(uid)
        except (OSError, ValueError, subprocess.SubprocessError):
            # ps unavailable, or its output is not exactly "<uid> <user>"
            # (e.g. the process has already exited).
            uid, uname = -1, ''

        processes.append(GPUProcess(pid, processName, gpuId, gpuUuid,
                                    gpuName, usedMemory, uid, uname))
    return processes


def getAvailable(order = 'first', limit=1, maxLoad=0.5, maxMemory=0.5, memoryFree=0, includeNan=False, excludeID=[], excludeUUID=[]):
    """Return the ids of up to ``limit`` GPUs that pass the availability filter.

    order:
        'first'  -> ascending GPU id (default)
        'last'   -> descending GPU id
        'random' -> random permutation of the available GPUs
        'load'   -> ascending load
        'memory' -> ascending memory utilization
        Any other value leaves the GPUs in query order.
    limit:
        Upper bound on the number of ids returned; fewer are returned when
        fewer GPUs are available.
    The remaining arguments are forwarded to ``getAvailability``.
    """
    def nanAs(value, fallback):
        # Substitute NaN so that sort keys stay comparable.
        return fallback if math.isnan(value) else value

    allGPUs = getGPUs()
    flags = getAvailability(allGPUs, maxLoad=maxLoad, maxMemory=maxMemory, memoryFree=memoryFree, includeNan=includeNan, excludeID=excludeID, excludeUUID=excludeUUID)
    # Keep only the GPUs flagged as available.
    candidates = [gpu for gpu, flag in zip(allGPUs, flags) if flag == 1]

    if order == 'first':
        candidates.sort(key=lambda gpu: nanAs(gpu.id, float('inf')))
    elif order == 'last':
        candidates.sort(key=lambda gpu: nanAs(gpu.id, float('-inf')), reverse=True)
    elif order == 'random':
        permutation = random.sample(range(0, len(candidates)), len(candidates))
        candidates = [candidates[pos] for pos in permutation]
    elif order == 'load':
        candidates.sort(key=lambda gpu: nanAs(gpu.load, float('inf')))
    elif order == 'memory':
        candidates.sort(key=lambda gpu: nanAs(gpu.memoryUtil, float('inf')))

    # Cap at the requested number, never beyond what is available.
    selected = candidates[0:min(limit, len(candidates))]
    return [gpu.id for gpu in selected]
#def getAvailability(GPUs, maxLoad = 0.5, maxMemory = 0.5, includeNan = False):
#    # Determine, which GPUs are available
#    GPUavailability = np.zeros(len(GPUs))
#    for i in range(len(GPUs)):
#        if (GPUs[i].load < maxLoad or (includeNan and np.isnan(GPUs[i].load))) and (GPUs[i].memoryUtil < maxMemory  or (includeNan and np.isnan(GPUs[i].memoryUtil))):
#            GPUavailability[i] = 1
def getAvailability(GPUs, maxLoad=0.5, maxMemory=0.5, memoryFree=0, includeNan=False, excludeID=[], excludeUUID=[]):
    """Return a list of 1/0 flags, one per entry of ``GPUs``.

    A GPU is flagged 1 when all of the following hold:
      * its ``memoryFree`` is at least ``memoryFree``;
      * its ``load`` is below ``maxLoad`` (or is NaN and ``includeNan``);
      * its ``memoryUtil`` is below ``maxMemory`` (or is NaN and ``includeNan``);
      * its ``id`` is not in ``excludeID`` and its ``uuid`` is not in
        ``excludeUUID``.
    """
    def isUsable(gpu):
        if not (gpu.memoryFree >= memoryFree):
            return False
        if not (gpu.load < maxLoad or (includeNan and math.isnan(gpu.load))):
            return False
        if not (gpu.memoryUtil < maxMemory or (includeNan and math.isnan(gpu.memoryUtil))):
            return False
        return (gpu.id not in excludeID) and (gpu.uuid not in excludeUUID)

    return [1 if isUsable(gpu) else 0 for gpu in GPUs]
def getFirstAvailable(order = 'first', maxLoad=0.5, maxMemory=0.5, attempts=1, interval=900, verbose=False, includeNan=False, excludeID=[], excludeUUID=[]):
    """Poll for an available GPU and return its id in a one-element list.

    Calls ``getAvailable(limit=1, ...)`` up to ``attempts`` times, sleeping
    ``interval`` seconds between attempts (not after the last one).

    Returns:
        The non-empty list returned by ``getAvailable``.

    Raises:
        RuntimeError: if no GPU was found within ``attempts`` attempts,
            including when ``attempts`` is zero or negative.
    """
    # Defined up front so that attempts <= 0 ends in the RuntimeError below
    # rather than an unbound-variable error.
    available = []
    for i in range(attempts):
        if (verbose):
            print('Attempting (' + str(i+1) + '/' + str(attempts) + ') to locate available GPU.')
        # Get first available GPU
        available = getAvailable(order=order, limit=1, maxLoad=maxLoad, maxMemory=maxMemory, includeNan=includeNan, excludeID=excludeID, excludeUUID=excludeUUID)
        # If an available GPU was found, break for loop.
        if (available):
            if (verbose):
                print('GPU ' + str(available) + ' located!')
            break
        # If this is not the last attempt, sleep for 'interval' seconds
        if (i != attempts-1):
            time.sleep(interval)
    # Check if a GPU was found, or if the attempts simply ran out. Throw error, if no GPU was found
    if (not(available)):
        raise RuntimeError('Could not find an available GPU after ' + str(attempts) + ' attempts with ' + str(interval) + ' seconds interval.')
    # Return found GPU
    return available
def _formatGPUAttr(value, widthSpec, precisionSpec, attrName):
    """Format one table cell value (without suffix) by its runtime type.

    ``widthSpec`` and ``precisionSpec`` are format-spec fragments (possibly
    empty).  Floats use ``f`` with both fragments, ints use ``d`` and
    strings use ``s`` with the width only.  Raises TypeError for any other
    type; ``attrName`` is only used in that error message.
    """
    if isinstance(value, float):
        return ('{0:' + widthSpec + precisionSpec + 'f}').format(value)
    if isinstance(value, int):
        return ('{0:' + widthSpec + 'd}').format(value)
    if isinstance(value, str):
        return ('{0:' + widthSpec + 's}').format(value)
    raise TypeError('Unhandled object type (' + str(type(value)) + ') for attribute \'' + attrName + '\'')


def showUtilization(all=False, attrList=None, useOldCode=False):
    """Print a table with the current status of every GPU to stdout.

    Args:
        all: when true, use the extended default layout (identity,
            temperature/load/memory utilization, memory sizes, display
            state); otherwise only ID, GPU load and memory utilization.
        attrList: optional custom layout, a list of column groups, each a
            list of dicts with the keys ``'attr'`` (GPU attribute name) and
            ``'name'`` (column header) and the optional keys ``'suffix'``,
            ``'transform'`` (callable applied to the value) and
            ``'precision'``.  When None, the default layout selected by
            ``all`` is used.  Ignored when ``useOldCode`` is true.
        useOldCode: when true, print with the fixed-format legacy layout.

    Raises:
        TypeError: if a (transformed) attribute value is not a float, int
            or str.
    """
    GPUs = getGPUs()

    if useOldCode:
        if all:
            print(' ID | Name | Serial | UUID || GPU util. | Memory util. || Memory total | Memory used | Memory free || Display mode | Display active |')
            print('------------------------------------------------------------------------------------------------------------------------------')
            for gpu in GPUs:
                print(' {0:2d} | {1:s}  | {2:s} | {3:s} || {4:3.0f}% | {5:3.0f}% || {6:.0f}MB | {7:.0f}MB | {8:.0f}MB || {9:s} | {10:s}'.format(gpu.id,gpu.name,gpu.serial,gpu.uuid,gpu.load*100,gpu.memoryUtil*100,gpu.memoryTotal,gpu.memoryUsed,gpu.memoryFree,gpu.display_mode,gpu.display_active))
        else:
            print(' ID  GPU  MEM')
            print('--------------')
            for gpu in GPUs:
                print(' {0:2d} {1:3.0f}% {2:3.0f}%'.format(gpu.id, gpu.load*100, gpu.memoryUtil*100))
        return

    # Only fall back to a built-in layout when the caller did not pass one.
    if attrList is None:
        if all:
            attrList = [[{'attr':'id','name':'ID'},
                         {'attr':'name','name':'Name'},
                         {'attr':'serial','name':'Serial'},
                         {'attr':'uuid','name':'UUID'}],
                        [{'attr':'temperature','name':'GPU temp.','suffix':'C','transform': lambda x: x,'precision':0},
                         {'attr':'load','name':'GPU util.','suffix':'%','transform': lambda x: x*100,'precision':0},
                         {'attr':'memoryUtil','name':'Memory util.','suffix':'%','transform': lambda x: x*100,'precision':0}],
                        [{'attr':'memoryTotal','name':'Memory total','suffix':'MB','precision':0},
                         {'attr':'memoryUsed','name':'Memory used','suffix':'MB','precision':0},
                         {'attr':'memoryFree','name':'Memory free','suffix':'MB','precision':0}],
                        [{'attr':'display_mode','name':'Display mode'},
                         {'attr':'display_active','name':'Display active'}]]
        else:
            attrList = [[{'attr':'id','name':'ID'},
                         {'attr':'load','name':'GPU','suffix':'%','transform': lambda x: x*100,'precision':0},
                         {'attr':'memoryUtil','name':'MEM','suffix':'%','transform': lambda x: x*100,'precision':0}],
                        ]

    headerString = ''
    GPUstrings = [''] * len(GPUs)
    for attrGroup in attrList:
        for attrDict in attrGroup:
            columnName = attrDict['name']
            precisionSpec = '.' + str(attrDict['precision']) if ('precision' in attrDict) else ''
            suffix = str(attrDict['suffix']) if ('suffix' in attrDict) else ''
            transform = attrDict['transform'] if ('transform' in attrDict) else (lambda x: x)
            values = [transform(getattr(gpu, attrDict['attr'])) for gpu in GPUs]

            # First pass: the column is as wide as its header or its
            # widest unpadded cell (suffix included), whichever is larger.
            minWidth = len(columnName)
            for value in values:
                cell = _formatGPUAttr(value, '', precisionSpec, columnName) + suffix
                minWidth = max(minWidth, len(cell))

            headerString += '| ' + columnName + ' ' + ' ' * max(0, minWidth - len(columnName))

            # Second pass: pad the value itself so value + suffix fills
            # the column exactly.
            widthSpec = str(minWidth - len(suffix))
            for gpuIdx, value in enumerate(values):
                cell = _formatGPUAttr(value, widthSpec, precisionSpec, columnName) + suffix
                GPUstrings[gpuIdx] += '| ' + cell + ' '

        # Close the column group with an extra separator.
        headerString += '|'
        GPUstrings = [row + '|' for row in GPUstrings]

    print(headerString)
    print('-' * len(headerString))
    for GPUstring in GPUstrings:
        print(GPUstring)


# Generate gpu uuid to id map at import time.
# Best effort: if the GPU query fails for any ordinary reason the map is
# simply left empty.  Only Exception is caught so that KeyboardInterrupt
# and SystemExit still propagate during import.
gpuUuidToIdMap = {}
try:
    gpus = getGPUs()
    for gpu in gpus:
        gpuUuidToIdMap[gpu.uuid] = gpu.id
    del gpus
except Exception:
    pass
def getGPUInfos():
    """Return the GPU list with each GPU's compute processes attached.

    Every ``GPU`` from ``getGPUs()`` gets an extra ``process`` attribute: a
    list of the ``GPUProcess`` objects (from ``getGPUProcesses()``) whose
    ``gpuUuid`` maps to that GPU.  Each attached process additionally gets a
    ``gpuid`` attribute holding the id of its GPU.

    Raises:
        KeyError: if a process reports a GPU uuid that is not among the
            GPUs returned by ``getGPUs()``.
    """
    gpus = getGPUs()
    uuidToId = {}
    gpuById = {}
    for gpu in gpus:
        uuidToId[gpu.uuid] = gpu.id
        gpu.process = []
        # Keep the first GPU seen for an id, like a list.index() lookup would.
        gpuById.setdefault(gpu.id, gpu)

    for proc in getGPUProcesses():
        proc.gpuid = uuidToId[proc.gpuUuid]
        gpuById[proc.gpuid].process.append(proc)
    return gpus

def get_available_gpu(gpuStatus):
    """Return the id, as a string, of the first GPU with no processes.

    ``gpuStatus`` is an iterable of GPU objects carrying a ``process`` list
    (as produced by ``getGPUInfos``).  Returns None when every GPU has at
    least one process or when ``gpuStatus`` is empty.
    """
    idleGpu = next((gpu for gpu in gpuStatus if len(gpu.process) == 0), None)
    if idleGpu is None:
        return None
    return str(idleGpu.id)
def get_whether_gpuProcess():
    """Return True when no GPU currently has any compute process.

    Queries the current state via ``getGPUInfos()``; returns False as soon
    as any GPU has a non-empty ``process`` list, and True when there are no
    GPUs at all.
    """
    return all(len(gpu.process) == 0 for gpu in getGPUInfos())
    
def get_offlineProcess_gpu(gpuStatus,pidInfos):
    """Return the GPUs that host no process marked as 'onLine'.

    Args:
        gpuStatus: GPU objects carrying a ``process`` list whose items have
            a ``pid`` attribute (as produced by ``getGPUInfos``).  The
            objects must be hashable.
        pidInfos: mapping ``pid -> info dict``; a process counts as online
            when its pid is present and ``info['type'] == 'onLine'``.
            Processes whose pid is not in the mapping are ignored.

    Returns:
        A list of the remaining GPUs, without duplicates, in the same
        order as they appear in ``gpuStatus`` so that downstream scheduling
        is deterministic.
    """
    onlineGpus = {
        gpu
        for gpu in gpuStatus
        if any(proc.pid in pidInfos and pidInfos[proc.pid]['type'] == 'onLine'
               for proc in gpu.process)
    }
    # dict.fromkeys de-duplicates while preserving the input order.
    return [gpu for gpu in dict.fromkeys(gpuStatus) if gpu not in onlineGpus]
def arrange_offlineProcess(gpuStatus,pidInfos,modelMemory=1500):
    """Return a list of GPU ids, one entry per model that still fits.

    For every GPU returned by ``get_offlineProcess_gpu`` the remaining
    budget is ``memoryTotal * 0.9 - memoryUsed``; the GPU's id is repeated
    once for each whole ``modelMemory`` that fits into that budget (zero
    times when the budget is smaller than one model).
    """
    slots = []
    for gpu in get_offlineProcess_gpu(gpuStatus,pidInfos):
        headroom = gpu.memoryTotal*0.9 - gpu.memoryUsed
        fitCount = int(headroom // modelMemory)
        slots += [gpu.id] * fitCount
    return slots
def get_potential_gpu(gpuStatus,pidInfos):
    """Pick the GPU that is cheapest to free up for an online task.

    Intended for the case where every GPU already has work on it and one
    card has to be vacated for an online task.

    Step 1: collect the GPUs that carry no online process
            (``get_offlineProcess_gpu``); return False if there are none.
    Step 2: among those, choose the one with the fewest processes (the
            first such GPU on ties).

    Returns:
        False, or ``{'cuda': <gpu id>, 'pids': [<pid>, ...]}`` listing the
        processes on the chosen GPU.
    """
    candidates = get_offlineProcess_gpu(gpuStatus,pidInfos)
    if len(candidates) == 0:
        return False

    # min() returns the first minimal element, matching a
    # list.index(min(...)) lookup.
    target = min(candidates, key=lambda gpu: len(gpu.process))
    return {'cuda': target.id, 'pids': [proc.pid for proc in target.process]}
if __name__=='__main__':
    # Manual smoke test: dump every GPU, the uuid -> id map and every
    # compute process with its attributes.
    gpus = getGPUs()
    for gpu in gpus:
        gpuUuidToIdMap[gpu.uuid] = gpu.id
        print(gpu)
    print(gpuUuidToIdMap)

    pres = getGPUProcesses()
    print('###line404:',pres)
    shownAttrs = ('gpuName','gpuUuid','pid','processName','uid','uname','usedMemory')
    for pre in pres:
        print('#'*20)
        for ken in shownAttrs:
            print(ken,'  ',getattr(pre, ken))
        print(' ')