153 lines
5.2 KiB
Python
153 lines
5.2 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
from threading import Thread
|
|||
|
|
from time import sleep, time
|
|||
|
|
from traceback import format_exc
|
|||
|
|
|
|||
|
|
#from common.Constant import init_progess
|
|||
|
|
import json,os,psutil,GPUtil,platform,socket
|
|||
|
|
from kafka import KafkaProducer, KafkaConsumer
|
|||
|
|
#from util.KafkaUtils import CustomerKafkaProducer
|
|||
|
|
from common.YmlConstant import service_yml_path, kafka_yml_path
|
|||
|
|
class uploadGPUinfos(Thread):
    """Background thread that periodically publishes host status to Kafka.

    Each cycle collects a snapshot via ``get_system_info()``, serialises it
    to JSON and sends it to the topic named by ``kafka_config["topicGPU"]``,
    then sleeps for ``context["GPUpollInterval"]`` seconds.

    Positional arguments (exactly two, in this order):
        context: mapping that provides ``'GPUpollInterval'`` (seconds
            between uploads).
        kafka_config: mapping that provides ``'bootstrap_servers'`` and
            ``'topicGPU'``.
    """

    # Names are private (double underscore) and are name-mangled both here
    # and at the assignment sites, so the slot names match the attributes
    # that __init__ actually sets.
    __slots__ = (
        '__context',
        '__kafka_config',
        '__uploadInterval',
        '__producer',
        '__topic',
    )

    def __init__(self, *args):
        super().__init__()
        self.__context, self.__kafka_config = args
        self.__uploadInterval = self.__context['GPUpollInterval']
        self.__producer = KafkaProducer(
            bootstrap_servers=self.__kafka_config['bootstrap_servers'],
            # Messages are handed over as str; encode them for the wire.
            value_serializer=lambda v: v.encode('utf-8'),
        )
        self.__topic = self.__kafka_config["topicGPU"]

    def run(self):
        """Publish one status message per poll interval, forever.

        Failures are reported on stdout and never terminate the loop; the
        poll interval is honoured after a failure as well, so a persistent
        error (e.g. broker unreachable) cannot turn into a busy loop.
        """
        # Local import: only this method needs the Kafka exception base class.
        from kafka.errors import KafkaError

        while True:
            try:
                # Collect the current server/GPU status and publish it as JSON.
                payload = json.dumps(get_system_info())
                future = self.__producer.send(self.__topic, payload)
                # Wait for the broker acknowledgement so delivery errors
                # surface here instead of being dropped silently.
                future.get(timeout=10)
            except KafkaError:
                print(format_exc())
            except Exception as e:
                # TODO: route through the project logger once one is wired in.
                print(e)
            sleep(self.__uploadInterval)
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_system_info():
    """Collect a snapshot of CPU, memory, GPU, OS and LAN-IP information.

    Returns:
        dict: mapping with the keys ``'CPU'``, ``'Memory'``, ``'GPU'`` (a
        list with one entry per GPU returned by ``GPUtil.getGPUs()``) and
        ``'System'``.

    Note:
        The two ``psutil.cpu_percent(interval=1)`` calls each block for
        about one second, so a call takes roughly two seconds.
    """
    system_info = {}

    # CPU information.
    # psutil.cpu_freq() can return None, or be unavailable altogether, on
    # hosts where the frequency cannot be determined; report None instead
    # of failing the whole snapshot.
    try:
        cpu_freq = psutil.cpu_freq()
    except (AttributeError, NotImplementedError, OSError):
        cpu_freq = None
    system_info['CPU'] = {
        'Physical Cores': psutil.cpu_count(logical=False),  # physical cores
        'Logical Cores': psutil.cpu_count(logical=True),  # logical cores
        'Current Frequency': cpu_freq.current if cpu_freq is not None else None,
        'Usage Per Core': psutil.cpu_percent(interval=1, percpu=True),  # per-core usage, %
        'Total Usage': psutil.cpu_percent(interval=1),  # overall usage, %
    }

    # Memory information; byte counts are converted to GiB.
    memory = psutil.virtual_memory()
    gib = 1024 ** 3
    system_info['Memory'] = {
        'Total': memory.total / gib,
        'Available': memory.available / gib,
        'Used': memory.used / gib,
        'Usage Percentage': memory.percent,
    }

    # GPU information, one entry per device.
    system_info['GPU'] = [
        {
            'ID': gpu.id,
            'Name': gpu.name,
            'Load': gpu.load * 100,  # load fraction -> percent
            'Memory Total': gpu.memoryTotal,  # as reported by GPUtil (presumably MB)
            'Memory Used': gpu.memoryUsed,
            'Memory Free': gpu.memoryFree,
            'Temperature': gpu.temperature,
        }
        for gpu in GPUtil.getGPUs()
    ]

    # Operating-system information.
    system_info['System'] = {
        'Platform': platform.system(),  # OS type
        'Platform Version': platform.version(),  # OS version
        'Platform Release': platform.release(),  # OS release
        'Platform Node': platform.node(),  # network name of the host
        'Machine': platform.machine(),  # hardware architecture
        'Processor': platform.processor(),  # processor name
    }

    # LAN address: the first IPv4 address, over all interfaces, that is not
    # a loopback (127.x.x.x) address.
    try:
        local_ip = next(
            (
                addr.address
                for addrs in psutil.net_if_addrs().values()
                for addr in addrs
                if addr.family == socket.AF_INET
                and not addr.address.startswith("127.")
            ),
            "No local IP found",
        )
    except Exception:
        # Best effort: the IP is informational and must not fail the snapshot.
        local_ip = "Unable to retrieve local IP address"
    system_info['System']['Local IP Address'] = local_ip

    return system_info
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Manual smoke test: start the uploader against a test broker and keep
    # the process alive until Ctrl+C.
    context = {
        'GPUpollInterval': 1,
        'topic': 'server-status',
    }
    kafka_config = {
        'bootstrap_servers': ['192.168.10.66:9092'],
        # uploadGPUinfos.__init__ reads the target topic from
        # kafka_config["topicGPU"]; without this key construction fails
        # with KeyError.
        'topicGPU': context['topic'],
    }

    upload_thread = uploadGPUinfos(context, kafka_config)
    # Daemon thread: the uploader loops forever, so it must not keep the
    # process alive once the main thread exits on Ctrl+C.
    # (Thread.setDaemon() is deprecated in favour of the attribute.)
    upload_thread.daemon = True
    upload_thread.start()

    # The main thread idles so the daemon thread keeps running.
    try:
        while True:
            sleep(1)
    except KeyboardInterrupt:
        print("主线程退出")
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|