algN/concurrency/uploadGPU.py

153 lines
5.2 KiB
Python
Raw Normal View History

2025-08-23 10:12:26 +08:00
# -*- coding: utf-8 -*-
from threading import Thread
from time import sleep, time
from traceback import format_exc
#from common.Constant import init_progess
import json,os,psutil,GPUtil,platform,socket
from kafka import KafkaProducer, KafkaConsumer
#from util.KafkaUtils import CustomerKafkaProducer
from common.YmlConstant import service_yml_path, kafka_yml_path
class uploadGPUinfos(Thread):
__slots__ = ('__kafka_config', "_context")
def __init__(self, *args):
super().__init__()
self.__context,self.__kafka_config = args
self.__uploadInterval = self.__context['GPUpollInterval']
#kafkaProducer = CustomerKafkaProducer(self.__kafka_config)
self.__producer = KafkaProducer(
bootstrap_servers=self.__kafka_config['bootstrap_servers'],#tencent yun
value_serializer=lambda v: v.encode('utf-8'))
self.__topic = self.__kafka_config["topicGPU"]
def run(self):
while True:
try:
#获取当前的gpu状态信息
msg_dict = get_system_info()
#发送GPU状态到指定的topic
msg = json.dumps(msg_dict)
# 假设生产的消息为键值对不是一定要键值对且序列化方式为json
#future = kafkaProducer.sender(topic_on,msg)
future = self.__producer .send(self.__topic,msg)
try:
future.get(timeout=10)
except kafka_errors:
traceback.format_exc()
sleep(self.__uploadInterval)
except Exception as e:
print(e)
continue
#logger.error("上传GPU服务器线程状态异常:{}, requestId:{}", format_exc(), request_id)
def get_system_info():
# 初始化一个字典来存储系统信息
system_info = {}
# 获取CPU信息
system_info['CPU'] = {
'Physical Cores': psutil.cpu_count(logical=False), # 物理核心数
'Logical Cores': psutil.cpu_count(logical=True), # 逻辑核心数
'Current Frequency': psutil.cpu_freq().current, # 当前频率
'Usage Per Core': psutil.cpu_percent(interval=1, percpu=True), # 每个核心的使用率
'Total Usage': psutil.cpu_percent(interval=1) # 总体CPU使用率
}
# 获取内存信息
memory = psutil.virtual_memory()
system_info['Memory'] = {
'Total': memory.total / (1024 ** 3), # 总内存单位为GB
'Available': memory.available / (1024 ** 3), # 可用内存
'Used': memory.used / (1024 ** 3), # 已用内存
'Usage Percentage': memory.percent # 内存使用率
}
# 获取GPU信息
gpus = GPUtil.getGPUs()
system_info['GPU'] = []
for gpu in gpus:
gpu_info = {
'ID': gpu.id,
'Name': gpu.name,
'Load': gpu.load * 100, # GPU负载百分比
'Memory Total': gpu.memoryTotal, # 总显存单位为MB
'Memory Used': gpu.memoryUsed, # 已用显存
'Memory Free': gpu.memoryFree, # 可用显存
'Temperature': gpu.temperature # GPU温度
}
system_info['GPU'].append(gpu_info)
# 获取系统信息
system_info['System'] = {
'Platform': platform.system(), # 操作系统类型
'Platform Version': platform.version(), # 操作系统版本
'Platform Release': platform.release(), # 操作系统发行版本
'Platform Node': platform.node(), # 网络名称
'Machine': platform.machine(), # 硬件架构
'Processor': platform.processor() # CPU架构
}
# 获取本机局域网IP地址非回环地址
try:
# 获取所有网络接口信息
net_if_addrs = psutil.net_if_addrs()
for interface, addrs in net_if_addrs.items():
for addr in addrs:
# 筛选IPv4地址且非回环地址
if addr.family == socket.AF_INET and not addr.address.startswith("127."):
system_info['System']['Local IP Address'] = addr.address
break
if 'Local IP Address' in system_info['System']:
break
else:
system_info['System']['Local IP Address'] = "No local IP found"
except Exception as e:
system_info['System']['Local IP Address'] = "Unable to retrieve local IP address"
return system_info
if __name__=="__main__":
context = {
'GPUpollInterval':1,
'topic':'server-status',
}
kafka_config = {
'bootstrap_servers':['192.168.10.66:9092']
}
base_dir, env = '/home/thsw2/WJ/test/tuoheng_algN','test'
upload_thread = uploadGPUinfos(context,kafka_config)
upload_thread.setDaemon(False)
upload_thread.start()
# 主线程等待守护线程运行
try:
while True:
sleep(1)
except KeyboardInterrupt:
print("主线程退出")