tuoheng_algN/concurrency/uploadGPU.py

153 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
from threading import Thread
from time import sleep, time
from traceback import format_exc
#from common.Constant import init_progess
import json,os,psutil,GPUtil,platform,socket
from kafka import KafkaProducer, KafkaConsumer
#from util.KafkaUtils import CustomerKafkaProducer
from common.YmlConstant import service_yml_path, kafka_yml_path
class uploadGPUinfos(Thread):
__slots__ = ('__kafka_config', "_context")
def __init__(self, *args):
super().__init__()
self.__context,self.__kafka_config = args
self.__uploadInterval = self.__context['GPUpollInterval']
#kafkaProducer = CustomerKafkaProducer(self.__kafka_config)
self.__producer = KafkaProducer(
bootstrap_servers=self.__kafka_config['bootstrap_servers'],#tencent yun
value_serializer=lambda v: v.encode('utf-8'))
self.__topic = self.__kafka_config["topicGPU"]
def run(self):
while True:
try:
#获取当前的gpu状态信息
msg_dict = get_system_info()
#发送GPU状态到指定的topic
msg = json.dumps(msg_dict)
# 假设生产的消息为键值对不是一定要键值对且序列化方式为json
#future = kafkaProducer.sender(topic_on,msg)
future = self.__producer .send(self.__topic,msg)
try:
future.get(timeout=10)
except kafka_errors:
traceback.format_exc()
sleep(self.__uploadInterval)
except Exception as e:
print(e)
continue
#logger.error("上传GPU服务器线程状态异常:{}, requestId:{}", format_exc(), request_id)
def get_system_info():
# 初始化一个字典来存储系统信息
system_info = {}
# 获取CPU信息
system_info['CPU'] = {
'Physical Cores': psutil.cpu_count(logical=False), # 物理核心数
'Logical Cores': psutil.cpu_count(logical=True), # 逻辑核心数
'Current Frequency': psutil.cpu_freq().current, # 当前频率
'Usage Per Core': psutil.cpu_percent(interval=1, percpu=True), # 每个核心的使用率
'Total Usage': psutil.cpu_percent(interval=1) # 总体CPU使用率
}
# 获取内存信息
memory = psutil.virtual_memory()
system_info['Memory'] = {
'Total': memory.total / (1024 ** 3), # 总内存单位为GB
'Available': memory.available / (1024 ** 3), # 可用内存
'Used': memory.used / (1024 ** 3), # 已用内存
'Usage Percentage': memory.percent # 内存使用率
}
# 获取GPU信息
gpus = GPUtil.getGPUs()
system_info['GPU'] = []
for gpu in gpus:
gpu_info = {
'ID': gpu.id,
'Name': gpu.name,
'Load': gpu.load * 100, # GPU负载百分比
'Memory Total': gpu.memoryTotal, # 总显存单位为MB
'Memory Used': gpu.memoryUsed, # 已用显存
'Memory Free': gpu.memoryFree, # 可用显存
'Temperature': gpu.temperature # GPU温度
}
system_info['GPU'].append(gpu_info)
# 获取系统信息
system_info['System'] = {
'Platform': platform.system(), # 操作系统类型
'Platform Version': platform.version(), # 操作系统版本
'Platform Release': platform.release(), # 操作系统发行版本
'Platform Node': platform.node(), # 网络名称
'Machine': platform.machine(), # 硬件架构
'Processor': platform.processor() # CPU架构
}
# 获取本机局域网IP地址非回环地址
try:
# 获取所有网络接口信息
net_if_addrs = psutil.net_if_addrs()
for interface, addrs in net_if_addrs.items():
for addr in addrs:
# 筛选IPv4地址且非回环地址
if addr.family == socket.AF_INET and not addr.address.startswith("127."):
system_info['System']['Local IP Address'] = addr.address
break
if 'Local IP Address' in system_info['System']:
break
else:
system_info['System']['Local IP Address'] = "No local IP found"
except Exception as e:
system_info['System']['Local IP Address'] = "Unable to retrieve local IP address"
return system_info
if __name__=="__main__":
context = {
'GPUpollInterval':1,
'topic':'server-status',
}
kafka_config = {
'bootstrap_servers':['192.168.10.66:9092']
}
base_dir, env = '/home/thsw2/WJ/test/tuoheng_algN','test'
upload_thread = uploadGPUinfos(context,kafka_config)
upload_thread.setDaemon(False)
upload_thread.start()
# 主线程等待守护线程运行
try:
while True:
sleep(1)
except KeyboardInterrupt:
print("主线程退出")