tuoheng_algN/vodsdk/service/Dispatcher.py

483 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import time,os
from os.path import join
from traceback import format_exc
import json
from cerberus import Validator
from common.Constant import ONLINE_START_SCHEMA, ONLINE_STOP_SCHEMA, OFFLINE_START_SCHEMA, OFFLINE_STOP_SCHEMA, \
IMAGE_SCHEMA, RECORDING_START_SCHEMA, RECORDING_STOP_SCHEMA, PULL2PUSH_START_SCHEMA, PULL2PUSH_STOP_SCHEMA
from common.YmlConstant import service_yml_path, kafka_yml_path
from concurrency.FeedbackThread import FeedbackThread
from concurrency.uploadGPU import uploadGPUinfos
from concurrency.IntelligentRecognitionProcess2 import OnlineIntelligentRecognitionProcess2, \
OfflineIntelligentRecognitionProcess2, PhotosIntelligentRecognitionProcess2
from concurrency.Pull2PushStreamProcess import PushStreamProcess
from entity.FeedBack import message_feedback, recording_feedback, pull_stream_feedback
from enums.AnalysisStatusEnum import AnalysisStatus
from enums.AnalysisTypeEnum import AnalysisType
from enums.ExceptionEnum import ExceptionType
from enums.ModelTypeEnum import ModelMethodTypeEnum, ModelType
from enums.RecordingStatusEnum import RecordingStatus
from enums.StatusEnum import PushStreamStatus, ExecuteStatus
from exception.CustomerException import ServiceException
from loguru import logger
from multiprocessing import Queue
from concurrency.IntelligentRecognitionProcess import OnlineIntelligentRecognitionProcess, \
OfflineIntelligentRecognitionProcess, PhotosIntelligentRecognitionProcess, ScreenRecordingProcess
from util.CpuUtils import print_cpu_ex_status
from util.FileUtils import create_dir_not_exist
from util.GPUtils import get_first_gpu_name, print_gpu_ex_status, check_cude_is_available,select_best_server
from util.KafkaUtils import CustomerKafkaConsumer
from util.QueUtil import put_queue
from util.RWUtils import getConfigs
from kafka import KafkaProducer, KafkaConsumer
'''
分发服务
'''
class DispatcherService:
__slots__ = ('__context', '__feedbackThread', '__listeningProcesses', '__fbQueue', '__topics','__taskType', '__task_type',
'__kafka_config', '__recordingProcesses', '__pull2PushProcesses','__topicsPort','__gpuTopic','__role','__uploadGPUThread','__gpuDics','__producer')
def __init__(self, base_dir, env):
# 检测cuda是否活动
check_cude_is_available()
# 获取全局上下文配置
self.__context = getConfigs(join(base_dir, service_yml_path % env))
# 创建任务执行, 视频保存路径
create_dir_not_exist(join(base_dir, self.__context["video"]["file_path"]))
# 将根路径和环境设置到上下文中
self.__context["base_dir"], self.__context["env"] = base_dir, env
# 问题反馈线程
self.__feedbackThread,self.__uploadGPUThread, self.__fbQueue = None,None, Queue()
# 实时、离线、图片任务进程字典
self.__listeningProcesses = {}
# 录屏任务进程字典
self.__recordingProcesses = {}
# 转推流任务进程字典
self.__pull2PushProcesses = {}
self.__kafka_config = getConfigs(join(base_dir, kafka_yml_path % env))
self.__producer = KafkaProducer(
bootstrap_servers=self.__kafka_config['bootstrap_servers'],#tencent yun
value_serializer=lambda v: v.encode('utf-8'))
self.__gpuDics = { }#用于存储gpu信息的字典
self.__role = self.__context["role"]
self.__topics = [
self.__kafka_config["topic"]["dsp-alg-online-tasks-topic"], # 实时监听topic
self.__kafka_config["topic"]["dsp-alg-offline-tasks-topic"], # 离线监听topic
self.__kafka_config["topic"]["dsp-alg-image-tasks-topic"], # 图片监听topic
self.__kafka_config["topic"]["dsp-recording-task-topic"], # 录屏监听topic
self.__kafka_config["topic"]["dsp-push-stream-task-topic"] # 推流监听topic
]
self.__topicsPort = [
self.__kafka_config["topicPort"]["dsp-alg-online-tasks-topic"], # 实时监听topic
self.__kafka_config["topicPort"]["dsp-alg-offline-tasks-topic"], # 离线监听topic
self.__kafka_config["topicPort"]["dsp-alg-image-tasks-topic"], # 图片监听topic
self.__kafka_config["topicPort"]["dsp-recording-task-topic"], # 录屏监听topic
self.__kafka_config["topicPort"]["dsp-push-stream-task-topic"] # 推流监听topic
]
self.__gpuTopic = [self.__kafka_config["topicGPU"]]
if self.__role==1:
self.__topics = self.__topics + self.__topicsPort + self.__gpuTopic
# 对应topic的各个lambda表达式
self.__task_type = {
self.__topics[0]: (AnalysisType.ONLINE.value, lambda x, y: self.online(x, y),
lambda x, y, z: self.identify_method(x, y, z)),
self.__topics[1]: (AnalysisType.OFFLINE.value, lambda x, y: self.offline(x, y),
lambda x, y, z: self.identify_method(x, y, z)),
self.__topics[2]: (AnalysisType.IMAGE.value, lambda x, y: self.image(x, y),
lambda x, y, z: self.identify_method(x, y, z)),
self.__topics[3]: (AnalysisType.RECORDING.value, lambda x, y: self.recording(x, y),
lambda x, y, z: self.recording_method(x, y, z)),
self.__topics[4]: (AnalysisType.PULLTOPUSH.value, lambda x, y: self.pullStream(x, y),
lambda x, y, z: self.push_stream_method(x, y, z))
}
self.__taskType={
self.__kafka_config["topic"]["dsp-alg-online-tasks-topic"]:0, # 实时监听topic
self.__kafka_config["topic"]["dsp-alg-offline-tasks-topic"]:1, # 离线监听topic
self.__kafka_config["topic"]["dsp-alg-image-tasks-topic"]:2, # 图片监听topic
self.__kafka_config["topic"]["dsp-recording-task-topic"]:3, # 录屏监听topic
self.__kafka_config["topic"]["dsp-push-stream-task-topic"]:4 # 推流监听topic
}
gpu_name_array = get_first_gpu_name()
gpu_array = [g for g in ('3090', '2080', '4090', 'A10') if g in gpu_name_array]
gpu_name = '2080Ti'
if len(gpu_array) > 0:
if gpu_array[0] != '2080':
gpu_name = gpu_array[0]
else:
raise Exception("GPU资源不在提供的模型所支持的范围内请先提供对应的GPU模型")
logger.info("当前服务环境为: {}, 服务器GPU使用型号: {}", env, gpu_name)
self.__context["gpu_name"] = gpu_name
self.start_service()
# 服务调用启动方法
def start_service(self):
# 初始化kafka监听者
customerKafkaConsumer = CustomerKafkaConsumer(self.__kafka_config, topics=self.__topics)
####增加一个线程用于试试监控和发送gpu状态####
####
logger.info("(♥◠‿◠)ノ゙ DSP【算法调度服务】启动成功 服务器IP:{}".format(self.__kafka_config['bootstrap_servers'] ))
while True:
try:
# 检查任务进程运行情况,去除结束的任务
self.check_process_task()
# 启动反馈线程
self.start_feedback_thread()
self.start_uploadGPU_thread()
msg = customerKafkaConsumer.poll()
if msg is not None and len(msg) > 0:
for k, v in msg.items():
for m in v:
message = m.value
#如果收到的信息是gpu状态的话收到信息后更新自己的gpu服务器状态下面不再执行
if m.topic in self.__gpuTopic:
customerKafkaConsumer.commit_offset(m,'x'*16,False)
#更新机器资源现状
ip = message['System']['Local IP Address']
self.__gpuDics[ip]=message
continue
#如果收到的信息是门户消息收到信息后要根据Gpu状态转发到对应的机器。
elif m.topic in self.__topicsPort:
customerKafkaConsumer.commit_offset(m, 'y'*16)
#状态分析
#recondGpu={'hostname':'thsw2','IP':'192.168.10.66','gpuId':0}
recondGpu= select_best_server(self.__gpuDics)
if recondGpu is None:
print( 'recondGpu',recondGpu, ' self.__gpuDics: ',self.__gpuDics,' topic:',m.topic, ' message:',message )
continue
#转发消息
message['transmit_topic'] = m.topic + '-' + recondGpu['IP']
transmitMsg={'transmit':message}
msg_json = json.dumps( message )
future = self.__producer.send( message['transmit_topic'] ,msg_json)
try:
future.get(timeout=2)
logger.info( "转发消息成功消息topic:{},消息内容:{}",message['transmit_topic'],message )
except kafka_errors as e:
print('------transmitted error:',e)
logger.info("转发消息失败")
traceback.format_exc()
else:
requestId = message.get("request_id")
if requestId is None:
logger.error("请求参数格式错误, 请检查请求体格式是否正确message:%s"%(message))
continue
customerKafkaConsumer.commit_offset(m, requestId)
logger.info("当前拉取到的消息, topic:{}, offset:{}, partition: {}, body: {}, requestId:{}",
m.topic, m.offset, m.partition, message, requestId)
message['taskType']=self.__taskType[m.topic]
topic_method = self.__task_type[m.topic]
topic_method[2](topic_method[1], message, topic_method[0])
else:
print_gpu_ex_status()
print_cpu_ex_status(self.__context["base_dir"])
time.sleep(1)
except Exception:
logger.error("主线程异常:{}", format_exc())
def identify_method(self, handle_method, message, analysisType):
try:
check_cude_is_available()
handle_method(message, analysisType)
except ServiceException as s:
logger.error("消息监听异常:{}, requestId: {}", s.msg, message["request_id"])
put_queue(self.__fbQueue, message_feedback(message["request_id"], AnalysisStatus.FAILED.value, analysisType,
s.code, s.msg), timeout=1)
except Exception:
logger.error("消息监听异常:{}, requestId: {}", format_exc(), message["request_id"])
put_queue(self.__fbQueue, message_feedback(message["request_id"], AnalysisStatus.FAILED.value, analysisType,
ExceptionType.SERVICE_INNER_EXCEPTION.value[0],
ExceptionType.SERVICE_INNER_EXCEPTION.value[1]), timeout=1)
finally:
del message
def push_stream_method(self, handle_method, message, analysisType):
try:
check_cude_is_available()
handle_method(message, analysisType)
except ServiceException as s:
logger.error("消息监听异常:{}, requestId: {}", s.msg, message['request_id'])
videoInfo = [{"id": url.get("id"), "status": PushStreamStatus.FAILED.value[0]} for url in
message.get("video_urls", []) if url.get("id") is not None]
put_queue(self.__fbQueue, pull_stream_feedback(message['request_id'], ExecuteStatus.FAILED.value[0],
s.code, s.msg, videoInfo), timeout=1)
except Exception:
logger.error("消息监听异常:{}, requestId: {}", format_exc(), message['request_id'])
videoInfo = [{"id": url.get("id"), "status": PushStreamStatus.FAILED.value[0]} for url in
message.get("video_urls", []) if url.get("id") is not None]
put_queue(self.__fbQueue, pull_stream_feedback(message.get("request_id"), ExecuteStatus.FAILED.value[0],
ExceptionType.SERVICE_INNER_EXCEPTION.value[0],
ExceptionType.SERVICE_INNER_EXCEPTION.value[1], videoInfo),
timeout=1)
finally:
del message
def recording_method(self, handle_method, message, analysisType):
try:
check_cude_is_available()
handle_method(message, analysisType)
except ServiceException as s:
logger.error("消息监听异常:{}, requestId: {}", s.msg, message["request_id"])
put_queue(self.__fbQueue,
recording_feedback(message["request_id"], RecordingStatus.RECORDING_FAILED.value[0],
error_code=s.code, error_msg=s.msg), timeout=1)
except Exception:
logger.error("消息监听异常:{}, requestId: {}", format_exc(), message["request_id"])
put_queue(self.__fbQueue,
recording_feedback(message["request_id"], RecordingStatus.RECORDING_FAILED.value[0],
ExceptionType.SERVICE_INNER_EXCEPTION.value[0],
ExceptionType.SERVICE_INNER_EXCEPTION.value[1]), timeout=1)
finally:
del message
# 开启实时进程
def startOnlineProcess(self, msg, analysisType):
if self.__listeningProcesses.get(msg["request_id"]):
logger.warning("实时重复任务请稍后再试requestId:{}", msg["request_id"])
return
model_type = self.__context["service"]["model"]["model_type"]
codes = [model.get("code") for model in msg["models"] if model.get("code")]
if ModelMethodTypeEnum.NORMAL.value == model_type or ModelType.ILLPARKING_MODEL.value[1] in codes:
coir = OnlineIntelligentRecognitionProcess(self.__fbQueue, msg, analysisType, self.__context)
else:
coir = OnlineIntelligentRecognitionProcess2(self.__fbQueue, msg, analysisType, self.__context)
coir.start()
logger.info("开始实时进程requestId:{},pid:{}, ppid:{}", msg["request_id"],os.getpid(),os.getppid())
self.__listeningProcesses[msg["request_id"]] = coir
# 结束实时进程
def stopOnlineProcess(self, msg):
ps = self.__listeningProcesses.get(msg["request_id"])
if ps is None:
logger.warning("未查询到该任务无法停止任务requestId:{}", msg["request_id"])
return
ps.sendEvent({"command": "stop"})
# 新增该函数用于向子任务发送命令algStartalgStop
def sendCmdToChildProcess(self, msg,cmd="algStart"):
ps = self.__listeningProcesses.get(msg["request_id"])
if ps is None:
logger.warning("未查询到该任务无法停止任务requestId:{}", msg["request_id"])
return
ps.sendEvent({"command": cmd})
@staticmethod
def check_process(listeningProcess):
for requestId in list(listeningProcess.keys()):
if not listeningProcess[requestId].is_alive():
del listeningProcess[requestId]
def check_process_task(self):
self.check_process(self.__listeningProcesses)
self.check_process(self.__recordingProcesses)
self.check_process(self.__pull2PushProcesses)
# 开启离线进程
def startOfflineProcess(self, msg, analysisType):
if self.__listeningProcesses.get(msg["request_id"]):
logger.warning("离线重复任务请稍后再试requestId:{}", msg["request_id"])
return
model_type = self.__context["service"]["model"]["model_type"]
codes = [model.get("code") for model in msg["models"] if model.get("code")]
if ModelMethodTypeEnum.NORMAL.value == model_type:
first = OfflineIntelligentRecognitionProcess(self.__fbQueue, msg, analysisType, self.__context)
else:
first = OfflineIntelligentRecognitionProcess2(self.__fbQueue, msg, analysisType, self.__context)
first.start()
self.__listeningProcesses[msg["request_id"]] = first
# 结束离线进程
def stopOfflineProcess(self, msg):
ps = self.__listeningProcesses.get(msg["request_id"])
if ps is None:
logger.warning("未查询到该任务无法停止任务requestId:{}", msg["request_id"])
return
ps.sendEvent({"command": "stop"})
# 开启图片分析进程
def startImageProcess(self, msg, analysisType):
pp = self.__listeningProcesses.get(msg["request_id"])
if pp is not None:
logger.warning("重复任务请稍后再试requestId:{}", msg["request_id"])
return
model_type = self.__context["service"]["model"]["model_type"]
codes = [model.get("code") for model in msg["models"] if model.get("code")]
if ModelMethodTypeEnum.NORMAL.value == model_type or ModelType.ILLPARKING_MODEL.value[1] in codes:
imaged = PhotosIntelligentRecognitionProcess(self.__fbQueue, msg, analysisType, self.__context)
else:
imaged = PhotosIntelligentRecognitionProcess2(self.__fbQueue, msg, analysisType, self.__context)
# 创建在线识别进程并启动
imaged.start()
self.__listeningProcesses[msg["request_id"]] = imaged
'''
校验kafka消息
'''
@staticmethod
def check_msg(msg, schema):
try:
v = Validator(schema, allow_unknown=True)
result = v.validate(msg)
if not result:
logger.error("参数校验异常: {}, requestId: {}", v.errors, msg["request_id"])
raise ServiceException(ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[0],
ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[1])
except ServiceException as s:
raise s
except Exception:
logger.error("参数校验异常: {}, requestId: {}", format_exc(), msg["request_id"])
raise ServiceException(ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[0],
ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[1])
'''
开启反馈线程,用于发送消息
'''
def start_feedback_thread(self):
if self.__feedbackThread is None:
self.__feedbackThread = FeedbackThread(self.__fbQueue, self.__kafka_config)
self.__feedbackThread.setDaemon(True)
self.__feedbackThread.start()
time.sleep(1)
if self.__feedbackThread and not self.__feedbackThread.is_alive():
logger.error("反馈线程异常停止, 开始重新启动反馈线程!!!!!")
self.__feedbackThread = FeedbackThread(self.__fbQueue, self.__kafka_config)
self.__feedbackThread.setDaemon(True)
self.__feedbackThread.start()
time.sleep(1)
def start_uploadGPU_thread(self):
if self.__uploadGPUThread is None:
self.__uploadGPUThread = uploadGPUinfos(self.__context, self.__kafka_config)
self.__uploadGPUThread.setDaemon(True)
self.__uploadGPUThread.start()
time.sleep(1)
if self.__uploadGPUThread and not self.__uploadGPUThread.is_alive():
logger.error("反馈线程异常停止, 开始重新启动反馈线程!!!!!")
self.__uploadGPUThread = uploadGPUinfos(self.__context, self.__kafka_config)
self.__uploadGPUThread.setDaemon(True)
self.__uploadGPUThread.start()
time.sleep(1)
'''
在线分析逻辑
'''
def online(self, message, analysisType):
if "start" == message.get("command"):
self.check_msg(message, ONLINE_START_SCHEMA)
if len(self.__listeningProcesses) >= int(self.__context['service']["task"]["limit"]):
raise ServiceException(ExceptionType.NO_RESOURCES.value[0],
ExceptionType.NO_RESOURCES.value[1])
self.startOnlineProcess(message, analysisType)
elif message.get("command") in ["algStart","algStop"]:
self.sendCmdToChildProcess(message,cmd=message.get("command"))
elif "stop" == message.get("command"):
self.check_msg(message, ONLINE_STOP_SCHEMA)
self.stopOnlineProcess(message)
else:
raise ServiceException(ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[0],
ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[1])
def offline(self, message, analysisType):
if "start" == message.get("command"):
self.check_msg(message, OFFLINE_START_SCHEMA)
if len(self.__listeningProcesses) >= int(self.__context['service']["task"]["limit"]):
raise ServiceException(ExceptionType.NO_RESOURCES.value[0],
ExceptionType.NO_RESOURCES.value[1])
self.startOfflineProcess(message, analysisType)
elif message.get("command") in ["algStart","algStop"]:
self.sendCmdToChildProcess( message,cmd=message.get("command"))
elif "stop" == message.get("command"):
self.check_msg(message, OFFLINE_STOP_SCHEMA)
self.stopOfflineProcess(message)
else:
raise ServiceException(ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[0],
ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[1])
def image(self, message, analysisType):
if "start" == message.get("command"):
self.check_msg(message, IMAGE_SCHEMA)
if len(self.__listeningProcesses) >= int(self.__context['service']["task"]["image"]["limit"]):
raise ServiceException(ExceptionType.NO_RESOURCES.value[0],
ExceptionType.NO_RESOURCES.value[1])
self.startImageProcess(message, analysisType)
else:
raise ServiceException(ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[0],
ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[1])
def recording(self, message, analysisType):
if "start" == message.get("command"):
self.check_msg(message, RECORDING_START_SCHEMA)
if len(self.__recordingProcesses) >= int(self.__context['service']["task"]["limit"]):
raise ServiceException(ExceptionType.NO_RESOURCES.value[0],
ExceptionType.NO_RESOURCES.value[1])
self.startRecordingProcess(message, analysisType)
elif "stop" == message.get("command"):
self.check_msg(message, RECORDING_STOP_SCHEMA)
self.stopRecordingProcess(message)
else:
raise ServiceException(ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[0],
ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[1])
# 开启录屏进程
def startRecordingProcess(self, msg, analysisType):
if self.__listeningProcesses.get(msg["request_id"]):
logger.warning("重复任务请稍后再试requestId:{}", msg["request_id"])
return
srp = ScreenRecordingProcess(self.__fbQueue, self.__context, msg, analysisType)
srp.start()
self.__recordingProcesses[msg["request_id"]] = srp
# 结束录屏进程
def stopRecordingProcess(self, msg):
rdp = self.__recordingProcesses.get(msg["request_id"])
if rdp is None:
logger.warning("未查询到该任务无法停止任务requestId:{}", msg["request_id"])
return
rdp.sendEvent({"command": "stop"})
def pullStream(self, message, analysisType):
if "start" == message.get("command"):
self.check_msg(message, PULL2PUSH_START_SCHEMA)
if len(self.__pull2PushProcesses) >= int(self.__context['service']["task"]["limit"]):
raise ServiceException(ExceptionType.NO_RESOURCES.value[0],
ExceptionType.NO_RESOURCES.value[1])
self.startPushStreamProcess(message, analysisType)
elif "stop" == message.get("command"):
self.check_msg(message, PULL2PUSH_STOP_SCHEMA)
self.stopPushStreamProcess(message)
else:
raise ServiceException(ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[0],
ExceptionType.ILLEGAL_PARAMETER_FORMAT.value[1])
def startPushStreamProcess(self, msg, analysisType):
if self.__pull2PushProcesses.get(msg["request_id"]):
logger.warning("重复任务请稍后再试requestId:{}", msg["request_id"])
return
srp = PushStreamProcess(self.__fbQueue, self.__context, msg, analysisType)
srp.start()
self.__pull2PushProcesses[msg["request_id"]] = srp
# 结束录屏进程
def stopPushStreamProcess(self, msg):
srp = self.__pull2PushProcesses.get(msg["request_id"])
if srp is None:
logger.warning("未查询到该任务无法停止任务requestId:{}", msg["request_id"])
return
srp.sendEvent({"command": "stop", "videoIds": msg.get("video_ids", [])})