# GPUtil - GPU utilization
#
# A Python module for programmatically getting the GPU utilization from NVIDIA GPUs using nvidia-smi
#
# Author: Anders Krogh Mortensen (anderskm)
# Date: 16 January 2017
# Web: https://github.com/anderskm/gputil
#
# LICENSE
#
# MIT License
#
# Copyright (c) 2017 anderskm
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from distutils import spawn
import os
import math
import random
import time
import sys
import platform
import subprocess

__version__ = '1.4.0'

class GPU:
    def __init__(self, ID, uuid, load, memoryTotal, memoryUsed, memoryFree, driver, gpu_name, serial, display_mode, display_active, temp_gpu):
        self.id = ID
        self.uuid = uuid
        self.load = load
        self.memoryUtil = float(memoryUsed) / float(memoryTotal)
        self.memoryTotal = memoryTotal
        self.memoryUsed = memoryUsed
        self.memoryFree = memoryFree
        self.driver = driver
        self.name = gpu_name
        self.serial = serial
        self.display_mode = display_mode
        self.display_active = display_active
        self.temperature = temp_gpu

    def __str__(self):
        return str(self.__dict__)

class GPUProcess:
    def __init__(self, pid, processName, gpuId, gpuUuid, gpuName, usedMemory,
                 uid, uname):
        self.pid = pid
        self.processName = processName
        self.gpuId = gpuId
        self.gpuUuid = gpuUuid
        self.gpuName = gpuName
        self.usedMemory = usedMemory
        self.uid = uid
        self.uname = uname

    def __str__(self):
        return str(self.__dict__)

def safeFloatCast(strNumber):
    # Cast a string to float, returning NaN when the value cannot be parsed
    try:
        number = float(strNumber)
    except ValueError:
        number = float('nan')
    return number

def getNvidiaSmiCmd():
    if platform.system() == "Windows":
        # If the platform is Windows and nvidia-smi
        # could not be found from the environment path,
        # try the default installation path on the system drive
        nvidia_smi = spawn.find_executable('nvidia-smi')
        if nvidia_smi is None:
            nvidia_smi = "%s\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe" % os.environ['systemdrive']
    else:
        nvidia_smi = "nvidia-smi"
    return nvidia_smi

def getGPUs():
    # Get ID, processing and memory utilization for all GPUs
    nvidia_smi = getNvidiaSmiCmd()
    try:
        p = subprocess.run([
            nvidia_smi,
            "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
            "--format=csv,noheader,nounits"
        ], stdout=subprocess.PIPE, encoding='utf8')
        stdout = p.stdout
    except Exception:
        return []
    output = stdout
    ## Parse output
    # Split on line break (subprocess.run with encoding='utf8' normalizes
    # line endings to '\n', so split on that rather than os.linesep)
    lines = output.split('\n')
    numDevices = len(lines) - 1
    GPUs = []
    for g in range(numDevices):
        line = lines[g]
        vals = line.split(', ')
        deviceIds = int(vals[0])
        uuid = vals[1]
        gpuUtil = safeFloatCast(vals[2]) / 100
        memTotal = safeFloatCast(vals[3])
        memUsed = safeFloatCast(vals[4])
        memFree = safeFloatCast(vals[5])
        driver = vals[6]
        gpu_name = vals[7]
        serial = vals[8]
        display_active = vals[9]
        display_mode = vals[10]
        temp_gpu = safeFloatCast(vals[11])
        GPUs.append(GPU(deviceIds, uuid, gpuUtil, memTotal, memUsed, memFree, driver, gpu_name, serial, display_mode, display_active, temp_gpu))
    return GPUs
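
# Example usage (a sketch; assumes an NVIDIA driver and nvidia-smi are installed):
#
#     >>> for gpu in getGPUs():
#     ...     print(gpu.id, gpu.name, gpu.load, gpu.memoryUtil)
#
# load and memoryUtil are fractions in [0, 1]; memoryTotal, memoryUsed and
# memoryFree are in MB, as reported by nvidia-smi.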

def getGPUProcesses():
    """Get all GPU compute processes."""
    global gpuUuidToIdMap
    gpuUuidToIdMap = {}
    try:
        gpus = getGPUs()
        for gpu in gpus:
            gpuUuidToIdMap[gpu.uuid] = gpu.id
        del gpus
    except Exception:
        pass
    nvidia_smi = getNvidiaSmiCmd()
    try:
        p = subprocess.run([
            nvidia_smi,
            "--query-compute-apps=pid,process_name,gpu_uuid,gpu_name,used_memory",
            "--format=csv,noheader,nounits"
        ], stdout=subprocess.PIPE, encoding='utf8')
        stdout = p.stdout
    except Exception:
        return []
    output = stdout
    ## Parse output
    # Split on line break
    lines = output.split('\n')
    numProcesses = len(lines) - 1
    processes = []
    for g in range(numProcesses):
        line = lines[g]
        vals = line.split(', ')
        pid = int(vals[0])
        processName = vals[1]
        gpuUuid = vals[2]
        gpuName = vals[3]
        usedMemory = safeFloatCast(vals[4])
        # Map the UUID back to a device ID; fall back to -1 if unknown
        gpuId = gpuUuidToIdMap.get(gpuUuid, -1)
        # Get uid and uname of the owner of the pid
        try:
            p = subprocess.run(['ps', f'-p{pid}', '-oruid=,ruser='],
                               stdout=subprocess.PIPE, encoding='utf8')
            uid, uname = p.stdout.split()
            uid = int(uid)
        except Exception:
            uid, uname = -1, ''
        processes.append(GPUProcess(pid, processName, gpuId, gpuUuid,
                                    gpuName, usedMemory, uid, uname))
    return processes
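
# Example usage (a sketch; the uid/uname lookup shells out to 'ps', so those
# fields are only filled in on Unix-like systems and fall back to (-1, '')):
#
#     >>> for proc in getGPUProcesses():
#     ...     print(proc.pid, proc.processName, proc.gpuId, proc.usedMemory)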

def getAvailable(order='first', limit=1, maxLoad=0.5, maxMemory=0.5, memoryFree=0, includeNan=False, excludeID=[], excludeUUID=[]):
    # order = first | last | random | load | memory
    # first  --> select the GPU with the lowest ID (DEFAULT)
    # last   --> select the GPU with the highest ID
    # random --> select a random available GPU
    # load   --> select the GPU with the lowest load
    # memory --> select the GPU with the most memory available
    # limit = 1 (DEFAULT), 2, ..., Inf
    # Limit sets the upper limit for the number of GPUs to return. E.g. if limit = 2, but only one is available, only one is returned.

    # Get device IDs, load and memory usage
    GPUs = getGPUs()
    # Determine which GPUs are available
    GPUavailability = getAvailability(GPUs, maxLoad=maxLoad, maxMemory=maxMemory, memoryFree=memoryFree, includeNan=includeNan, excludeID=excludeID, excludeUUID=excludeUUID)
    availableGPUindex = [idx for idx in range(0, len(GPUavailability)) if (GPUavailability[idx] == 1)]
    # Discard unavailable GPUs
    GPUs = [GPUs[g] for g in availableGPUindex]
    # Sort available GPUs according to the order argument
    if (order == 'first'):
        GPUs.sort(key=lambda x: float('inf') if math.isnan(x.id) else x.id, reverse=False)
    elif (order == 'last'):
        GPUs.sort(key=lambda x: float('-inf') if math.isnan(x.id) else x.id, reverse=True)
    elif (order == 'random'):
        GPUs = [GPUs[g] for g in random.sample(range(0, len(GPUs)), len(GPUs))]
    elif (order == 'load'):
        GPUs.sort(key=lambda x: float('inf') if math.isnan(x.load) else x.load, reverse=False)
    elif (order == 'memory'):
        GPUs.sort(key=lambda x: float('inf') if math.isnan(x.memoryUtil) else x.memoryUtil, reverse=False)
    # Extract the number of desired GPUs, but limited to the total number of available GPUs
    GPUs = GPUs[0:min(limit, len(GPUs))]
    # Extract the device IDs from the GPUs and return them
    deviceIds = [gpu.id for gpu in GPUs]
    return deviceIds
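
# Example usage (a sketch; setting CUDA_VISIBLE_DEVICES is one common way to
# pin a process to the selected device, not something this module does itself):
#
#     >>> import os
#     >>> deviceIds = getAvailable(order='memory', limit=1, maxLoad=0.5, maxMemory=0.5)
#     >>> if deviceIds:
#     ...     os.environ['CUDA_VISIBLE_DEVICES'] = str(deviceIds[0])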

def getAvailability(GPUs, maxLoad=0.5, maxMemory=0.5, memoryFree=0, includeNan=False, excludeID=[], excludeUUID=[]):
    # Determine which GPUs are available
    GPUavailability = [1 if (gpu.memoryFree >= memoryFree)
                         and (gpu.load < maxLoad or (includeNan and math.isnan(gpu.load)))
                         and (gpu.memoryUtil < maxMemory or (includeNan and math.isnan(gpu.memoryUtil)))
                         and ((gpu.id not in excludeID) and (gpu.uuid not in excludeUUID))
                       else 0
                       for gpu in GPUs]
    return GPUavailability

def getFirstAvailable(order='first', maxLoad=0.5, maxMemory=0.5, attempts=1, interval=900, verbose=False, includeNan=False, excludeID=[], excludeUUID=[]):
    for i in range(attempts):
        if (verbose):
            print('Attempting (' + str(i + 1) + '/' + str(attempts) + ') to locate available GPU.')
        # Get first available GPU
        available = getAvailable(order=order, limit=1, maxLoad=maxLoad, maxMemory=maxMemory, includeNan=includeNan, excludeID=excludeID, excludeUUID=excludeUUID)
        # If an available GPU was found, break the for loop.
        if (available):
            if (verbose):
                print('GPU ' + str(available) + ' located!')
            break
        # If this is not the last attempt, sleep for 'interval' seconds
        if (i != attempts - 1):
            time.sleep(interval)
    # Check if a GPU was found, or if the attempts simply ran out. Throw an error if no GPU was found.
    if (not available):
        raise RuntimeError('Could not find an available GPU after ' + str(attempts) + ' attempts with ' + str(interval) + ' seconds interval.')
    # Return found GPU
    return available
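
# Example usage (a sketch; polls up to 3 times, 10 minutes apart, and raises
# RuntimeError if no GPU becomes available):
#
#     >>> deviceIds = getFirstAvailable(order='first', maxLoad=0.5, maxMemory=0.5,
#     ...                               attempts=3, interval=600, verbose=True)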

def showUtilization(all=False, attrList=None, useOldCode=False):
    GPUs = getGPUs()
    if (all):
        if (useOldCode):
            print(' ID | Name | Serial | UUID || GPU util. | Memory util. || Memory total | Memory used | Memory free || Display mode | Display active |')
            print('------------------------------------------------------------------------------------------------------------------------------')
            for gpu in GPUs:
                print(' {0:2d} | {1:s} | {2:s} | {3:s} || {4:3.0f}% | {5:3.0f}% || {6:.0f}MB | {7:.0f}MB | {8:.0f}MB || {9:s} | {10:s}'.format(gpu.id, gpu.name, gpu.serial, gpu.uuid, gpu.load*100, gpu.memoryUtil*100, gpu.memoryTotal, gpu.memoryUsed, gpu.memoryFree, gpu.display_mode, gpu.display_active))
        else:
            attrList = [[{'attr':'id','name':'ID'},
                         {'attr':'name','name':'Name'},
                         {'attr':'serial','name':'Serial'},
                         {'attr':'uuid','name':'UUID'}],
                        [{'attr':'temperature','name':'GPU temp.','suffix':'C','transform': lambda x: x,'precision':0},
                         {'attr':'load','name':'GPU util.','suffix':'%','transform': lambda x: x*100,'precision':0},
                         {'attr':'memoryUtil','name':'Memory util.','suffix':'%','transform': lambda x: x*100,'precision':0}],
                        [{'attr':'memoryTotal','name':'Memory total','suffix':'MB','precision':0},
                         {'attr':'memoryUsed','name':'Memory used','suffix':'MB','precision':0},
                         {'attr':'memoryFree','name':'Memory free','suffix':'MB','precision':0}],
                        [{'attr':'display_mode','name':'Display mode'},
                         {'attr':'display_active','name':'Display active'}]]
    else:
        if (useOldCode):
            print(' ID GPU MEM')
            print('--------------')
            for gpu in GPUs:
                print(' {0:2d} {1:3.0f}% {2:3.0f}%'.format(gpu.id, gpu.load*100, gpu.memoryUtil*100))
        else:
            attrList = [[{'attr':'id','name':'ID'},
                         {'attr':'load','name':'GPU','suffix':'%','transform': lambda x: x*100,'precision':0},
                         {'attr':'memoryUtil','name':'MEM','suffix':'%','transform': lambda x: x*100,'precision':0}],
                        ]
    if (not useOldCode):
        if (attrList is not None):
            headerString = ''
            GPUstrings = [''] * len(GPUs)
            for attrGroup in attrList:
                for attrDict in attrGroup:
                    headerString = headerString + '| ' + attrDict['name'] + ' '
                    headerWidth = len(attrDict['name'])
                    minWidth = len(attrDict['name'])
                    attrPrecision = '.' + str(attrDict['precision']) if ('precision' in attrDict.keys()) else ''
                    attrSuffix = str(attrDict['suffix']) if ('suffix' in attrDict.keys()) else ''
                    attrTransform = attrDict['transform'] if ('transform' in attrDict.keys()) else lambda x: x
                    # First pass: find the minimum column width needed for this attribute
                    for gpu in GPUs:
                        attr = getattr(gpu, attrDict['attr'])
                        attr = attrTransform(attr)
                        if (isinstance(attr, float)):
                            attrStr = ('{0:' + attrPrecision + 'f}').format(attr)
                        elif (isinstance(attr, int)):
                            attrStr = ('{0:d}').format(attr)
                        elif (isinstance(attr, str)):
                            attrStr = attr
                        elif (sys.version_info[0] == 2):
                            if (isinstance(attr, unicode)):
                                attrStr = attr.encode('ascii', 'ignore')
                        else:
                            raise TypeError('Unhandled object type (' + str(type(attr)) + ') for attribute \'' + attrDict['name'] + '\'')
                        attrStr += attrSuffix
                        minWidth = max(minWidth, len(attrStr))
                    headerString += ' ' * max(0, minWidth - headerWidth)
                    minWidthStr = str(minWidth - len(attrSuffix))
                    # Second pass: format each GPU's value to the column width
                    for gpuIdx, gpu in enumerate(GPUs):
                        attr = getattr(gpu, attrDict['attr'])
                        attr = attrTransform(attr)
                        if (isinstance(attr, float)):
                            attrStr = ('{0:' + minWidthStr + attrPrecision + 'f}').format(attr)
                        elif (isinstance(attr, int)):
                            attrStr = ('{0:' + minWidthStr + 'd}').format(attr)
                        elif (isinstance(attr, str)):
                            attrStr = ('{0:' + minWidthStr + 's}').format(attr)
                        elif (sys.version_info[0] == 2):
                            if (isinstance(attr, unicode)):
                                attrStr = ('{0:' + minWidthStr + 's}').format(attr.encode('ascii', 'ignore'))
                        else:
                            raise TypeError('Unhandled object type (' + str(type(attr)) + ') for attribute \'' + attrDict['name'] + '\'')
                        attrStr += attrSuffix
                        GPUstrings[gpuIdx] += '| ' + attrStr + ' '
                headerString = headerString + '|'
                for gpuIdx, gpu in enumerate(GPUs):
                    GPUstrings[gpuIdx] += '|'
            headerSpacingString = '-' * len(headerString)
            print(headerString)
            print(headerSpacingString)
            for GPUstring in GPUstrings:
                print(GPUstring)
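
# Example usage (a sketch of the two display modes; actual values depend on
# the machine):
#
#     >>> showUtilization()          # compact table: ID, GPU %, MEM %
#     >>> showUtilization(all=True)  # adds name, serial, UUID, temperature, memory and display columns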

# Generate GPU UUID to ID map at import time
gpuUuidToIdMap = {}
try:
    gpus = getGPUs()
    for gpu in gpus:
        gpuUuidToIdMap[gpu.uuid] = gpu.id
    del gpus
except Exception:
    pass

def getGPUInfos():
    # Returns gpus: a list with one GPU object per device.
    # Each GPU object has the attributes 'id', 'load', 'memoryFree',
    # 'memoryTotal', 'memoryUsed', 'memoryUtil', 'name', 'serial',
    # 'temperature', 'uuid' and 'process'.
    # 'process' is a list with one object per compute process, with the
    # attributes 'gpuId', 'gpuName', 'gpuUuid', 'gpuid', 'pid',
    # 'processName', 'uid', 'uname' and 'usedMemory'.
    gpus = getGPUs()
    gpuUuidToIdMap = {}
    for gpu in gpus:
        gpuUuidToIdMap[gpu.uuid] = gpu.id
        gpu.process = []
    indexx = [x.id for x in gpus]
    process = getGPUProcesses()
    for pre in process:
        pre.gpuid = gpuUuidToIdMap[pre.gpuUuid]
        gpuId = indexx.index(pre.gpuid)
        gpus[gpuId].process.append(pre)
    return gpus
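
# Example usage (a sketch; each returned GPU object carries its compute
# processes in gpu.process):
#
#     >>> for gpu in getGPUInfos():
#     ...     print(gpu.id, len(gpu.process))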

def get_available_gpu(gpuStatus):
    # Check whether there is an idle GPU (one with no compute processes);
    # return its id as a string if so, otherwise return None.
    cuda = None
    for gpus in gpuStatus:
        if len(gpus.process) == 0:
            cuda = gpus.id
            return str(cuda)
    return cuda

def get_whether_gpuProcess():
    # Return True if no GPU has any compute process running, otherwise False.
    gpuStatus = getGPUInfos()
    gpuProcess = True
    for gpus in gpuStatus:
        if len(gpus.process) != 0:
            gpuProcess = False
    return gpuProcess

def get_offlineProcess_gpu(gpuStatus, pidInfos):
    # Return the GPUs that are not running any 'onLine' process.
    gpu_onLine = []
    for gpu in gpuStatus:
        for gpuProcess in gpu.process:
            pid = gpuProcess.pid
            if pid in pidInfos.keys():
                pidType = pidInfos[pid]['type']
                if pidType == 'onLine':
                    gpu_onLine.append(gpu)
    gpu_offLine = set(gpuStatus) - set(gpu_onLine)
    return list(gpu_offLine)

def arrange_offlineProcess(gpuStatus, pidInfos, modelMemory=1500):
    # For every GPU without 'onLine' processes, count how many models of
    # 'modelMemory' MB fit in 90% of its total memory, and emit its id
    # that many times.
    cudaArrange = []
    gpu_offLine = get_offlineProcess_gpu(gpuStatus, pidInfos)
    for gpu in gpu_offLine:
        leftMemory = gpu.memoryTotal * 0.9 - gpu.memoryUsed
        modelCnt = int(leftMemory // modelMemory)
        cudaArrange.extend([gpu.id] * modelCnt)
    return cudaArrange

def get_potential_gpu(gpuStatus, pidInfos):
    # All GPUs are busy; free up one card for an 'online' task.
    # Step 1: check whether any card runs no 'online' task.
    gpu_offLine = get_offlineProcess_gpu(gpuStatus, pidInfos)
    if len(gpu_offLine) == 0:
        return False
    # Step 2: count the offline processes on each such card and pick the card with the fewest.
    offLineCnt = [len(gpu.process) for gpu in gpu_offLine]
    minCntIndex = offLineCnt.index(min(offLineCnt))
    pids = [x.pid for x in gpu_offLine[minCntIndex].process]
    return {'cuda': gpu_offLine[minCntIndex].id, 'pids': pids}
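
# Example usage for the online/offline scheduling helpers (a sketch; the
# 'pidInfos' structure is assumed from how it is read above: a dict mapping
# each pid to a dict with at least a 'type' key, where 'onLine' marks
# online tasks):
#
#     >>> gpuStatus = getGPUInfos()
#     >>> pidInfos = {1234: {'type': 'onLine'}}        # hypothetical pid
#     >>> arrange_offlineProcess(gpuStatus, pidInfos)  # ids with free capacity
#     >>> get_potential_gpu(gpuStatus, pidInfos)       # card to clear, or False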

if __name__ == '__main__':
    gpus = getGPUs()
    for gpu in gpus:
        gpuUuidToIdMap[gpu.uuid] = gpu.id
        print(gpu)
    print(gpuUuidToIdMap)
    pres = getGPUProcesses()
    print('###line404:', pres)
    for pre in pres:
        print('#' * 20)
        for ken in ['gpuName', 'gpuUuid', 'pid', 'processName', 'uid', 'uname', 'usedMemory']:
            print(ken, ' ', pre.__getattribute__(ken))
        print(' ')