# -*- coding: utf-8 -*-
"""Background worker that periodically collects host CPU / memory / GPU /
platform statistics and publishes them as JSON to a Kafka topic."""
from threading import Thread
from time import sleep, time
from traceback import format_exc
import json
import os
import platform
import socket

import GPUtil
import psutil
from kafka import KafkaProducer, KafkaConsumer
from kafka.errors import KafkaError

from common.YmlConstant import service_yml_path, kafka_yml_path


class uploadGPUinfos(Thread):
    """Thread that polls :func:`get_system_info` every ``GPUpollInterval``
    seconds and sends the snapshot to ``kafka_config['topicGPU']``.

    Constructor args (positional): ``(context, kafka_config)``
        context      -- dict, must contain 'GPUpollInterval' (poll period, seconds)
        kafka_config -- dict, must contain 'bootstrap_servers' and 'topicGPU'
    """

    # NOTE(review): the original declared ``__slots__ = ('__kafka_config', "_context")``.
    # It was removed: Thread defines no __slots__, so instances keep a __dict__
    # anyway, and the declared names did not match the attributes set below —
    # the declaration was ineffective and misleading.

    def __init__(self, *args):
        super().__init__()
        self.__context, self.__kafka_config = args
        self.__uploadInterval = self.__context['GPUpollInterval']
        # Messages are plain JSON strings; encode to UTF-8 bytes on the wire.
        self.__producer = KafkaProducer(
            bootstrap_servers=self.__kafka_config['bootstrap_servers'],
            value_serializer=lambda v: v.encode('utf-8'))
        self.__topic = self.__kafka_config["topicGPU"]

    def run(self):
        """Poll-and-publish loop; runs until the process exits."""
        while True:
            try:
                # Snapshot the current host/GPU status.
                msg_dict = get_system_info()
                msg = json.dumps(msg_dict)
                future = self.__producer.send(self.__topic, msg)
                try:
                    # Wait briefly so delivery failures surface here.
                    future.get(timeout=10)
                except KafkaError:
                    # Fixed: the original caught the undefined name
                    # ``kafka_errors`` and called ``traceback.format_exc()``
                    # without importing ``traceback`` — any send failure would
                    # have raised NameError instead of being reported.
                    print(format_exc())
            except Exception as e:
                print(e)
            # Fixed: sleep on both success and failure. The original slept
            # inside the ``try`` only, so a persistent error made the loop
            # busy-spin with no delay between attempts.
            sleep(self.__uploadInterval)


def get_system_info():
    """Collect CPU, memory, GPU and platform information for this host.

    Returns:
        A JSON-serializable dict with keys 'CPU', 'Memory', 'GPU', 'System'.

    Note:
        The two ``cpu_percent(interval=1)`` calls block for ~2 seconds total
        per invocation, which bounds the effective polling rate.
    """
    system_info = {}

    # CPU information.
    system_info['CPU'] = {
        'Physical Cores': psutil.cpu_count(logical=False),
        'Logical Cores': psutil.cpu_count(logical=True),
        'Current Frequency': psutil.cpu_freq().current,                 # MHz
        'Usage Per Core': psutil.cpu_percent(interval=1, percpu=True),  # % per core
        'Total Usage': psutil.cpu_percent(interval=1)                   # % overall
    }

    # Memory information, converted to GB.
    memory = psutil.virtual_memory()
    system_info['Memory'] = {
        'Total': memory.total / (1024 ** 3),
        'Available': memory.available / (1024 ** 3),
        'Used': memory.used / (1024 ** 3),
        'Usage Percentage': memory.percent
    }

    # GPU information, one entry per visible device.
    system_info['GPU'] = []
    for gpu in GPUtil.getGPUs():
        system_info['GPU'].append({
            'ID': gpu.id,
            'Name': gpu.name,
            'Load': gpu.load * 100,           # GPU load as a percentage
            'Memory Total': gpu.memoryTotal,  # MB
            'Memory Used': gpu.memoryUsed,    # MB
            'Memory Free': gpu.memoryFree,    # MB
            'Temperature': gpu.temperature    # degrees C (per GPUtil)
        })

    # Operating system / platform information.
    system_info['System'] = {
        'Platform': platform.system(),
        'Platform Version': platform.version(),
        'Platform Release': platform.release(),
        'Platform Node': platform.node(),
        'Machine': platform.machine(),
        'Processor': platform.processor()
    }

    # First non-loopback IPv4 address found on any interface, if one exists.
    try:
        net_if_addrs = psutil.net_if_addrs()
        for interface, addrs in net_if_addrs.items():
            for addr in addrs:
                # Keep only IPv4, skip loopback (127.x.x.x).
                if addr.family == socket.AF_INET and not addr.address.startswith("127."):
                    system_info['System']['Local IP Address'] = addr.address
                    break
            if 'Local IP Address' in system_info['System']:
                break
        else:
            # for/else: no interface produced a usable address.
            system_info['System']['Local IP Address'] = "No local IP found"
    except Exception:
        system_info['System']['Local IP Address'] = "Unable to retrieve local IP address"

    return system_info


if __name__ == "__main__":
    context = {
        'GPUpollInterval': 1,
        'topic': 'server-status',
    }
    kafka_config = {
        'bootstrap_servers': ['192.168.10.66:9092'],
        # Fixed: uploadGPUinfos.__init__ reads kafka_config['topicGPU']; the
        # original demo config omitted it and crashed with KeyError on start.
        'topicGPU': 'server-status',
    }
    base_dir, env = '/home/thsw2/WJ/test/tuoheng_algN', 'test'
    upload_thread = uploadGPUinfos(context, kafka_config)
    # Fixed: Thread.setDaemon() is deprecated (since Python 3.10); assign the
    # ``daemon`` attribute before start() instead.
    upload_thread.daemon = False
    upload_thread.start()
    # Keep the main thread alive until interrupted.
    try:
        while True:
            sleep(1)
    except KeyboardInterrupt:
        print("主线程退出")