import argparse
import os
import sys
from pathlib import Path
import cv2
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
from collections import OrderedDict, namedtuple
import numpy as np
import time
import tensorrt as trt
#import pycuda.driver as cuda


# Default configuration used by segtrtEval when the caller passes no ``par``.
_DEFAULT_SEG_PAR = {
    'modelSize': (640, 360),
    'nclass': 2,
    'predResize': True,
    'mean': (0.485, 0.456, 0.406),
    'std': (0.229, 0.224, 0.225),
    'numpy': False,
    'RGB_convert_first': True,
}


def trt_version():
    """Return the TensorRT version string (``trt.__version__``)."""
    return trt.__version__


def _trt_major_version():
    """Return the leading numeric component of the TensorRT version, or 0."""
    digits = ''
    for ch in trt_version().split('.')[0]:
        if not ch.isdigit():
            break
        digits += ch
    return int(digits) if digits else 0


def torch_device_from_trt(device):
    """Map a ``trt.TensorLocation`` to the matching ``torch.device``.

    Raises:
        TypeError: if ``device`` is neither DEVICE nor HOST.
    """
    if device == trt.TensorLocation.DEVICE:
        return torch.device("cuda")
    if device == trt.TensorLocation.HOST:
        return torch.device("cpu")
    # The exception must be raised; returning it would pass an exception
    # object on to the caller as if it were a device.
    raise TypeError("%s is not supported by torch" % device)


def torch_dtype_from_trt(dtype):
    """Map a TensorRT data type to the matching ``torch.dtype``.

    Raises:
        TypeError: if ``dtype`` has no torch equivalent handled here.
    """
    if dtype == trt.int8:
        return torch.int8
    # Compare the major version numerically: a string comparison would rank
    # "10.x" below "7.0".
    if _trt_major_version() >= 7 and dtype == trt.bool:
        return torch.bool
    if dtype == trt.int32:
        return torch.int32
    if dtype == trt.float16:
        return torch.float16
    if dtype == trt.float32:
        return torch.float32
    raise TypeError("%s is not supported by torch" % dtype)


def _allocate_outputs(engine, output_names, bindings, batch_size=None):
    """Allocate one torch tensor per output binding and register its pointer.

    For every name in ``output_names`` the binding index, dtype, shape and
    location are queried from ``engine``; an uninitialised tensor is created
    and its data pointer is stored in ``bindings`` at the binding index.
    When ``batch_size`` is given it is prepended to the binding shape.

    Returns:
        list of the allocated tensors, in ``output_names`` order.
    """
    outputs = []
    for output_name in output_names:
        idx = engine.get_binding_index(output_name)
        dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx))
        shape = tuple(engine.get_binding_shape(idx))
        if batch_size is not None:
            shape = (batch_size,) + shape
        device = torch_device_from_trt(engine.get_location(idx))
        output = torch.empty(size=shape, dtype=dtype, device=device)
        bindings[idx] = output.data_ptr()
        outputs.append(output)
    return outputs


def _bind_input(engine, input_names, bindings, tensor):
    """Point every input binding at ``tensor`` and return the bound tensor.

    The contiguous tensor is returned so the caller can keep it referenced
    until inference has finished; ``contiguous()`` may allocate a copy, and
    dropping that copy would leave ``bindings`` holding a stale pointer.
    """
    bound = tensor.contiguous()
    pointer = bound.data_ptr()
    for input_name in input_names:
        bindings[engine.get_binding_index(input_name)] = pointer
    return bound


def _unwrap_outputs(outputs):
    """Return ``outputs[0]``, first unwrapping a single-element list.

    With exactly one output tensor this yields that tensor indexed at 0
    along its leading dimension; otherwise it yields the first tensor.
    """
    if len(outputs) == 1:
        outputs = outputs[0]
    return outputs[0]


class TRTModule(torch.nn.Module):
    """``torch.nn.Module`` wrapper that runs a TensorRT engine in ``forward``."""

    def __init__(self, engine=None, input_names=None, output_names=None):
        super().__init__()
        self.engine = engine
        self.input_names = input_names
        self.output_names = output_names

    def forward(self, *inputs):
        """Run the engine on ``inputs[0]`` and return the unwrapped output.

        A new execution context is created for each call. Output tensors are
        allocated with the input's batch size prepended to the binding shape.
        """
        with self.engine.create_execution_context() as context:
            batch_size = inputs[0].shape[0]
            bindings = [None] * (len(self.input_names) + len(self.output_names))
            outputs = _allocate_outputs(
                self.engine, self.output_names, bindings, batch_size)
            # Strictly this should bind inputs[i] per input name; because a
            # single image is used, every input is bound to the same tensor.
            bound_input = _bind_input(
                self.engine, self.input_names, bindings, inputs[0])
            context.execute_v2(bindings)
            del bound_input  # only needed alive until execution completes
        return _unwrap_outputs(outputs)


def get_ms(t1, t0):
    """Return the interval ``t1 - t0`` (seconds) in milliseconds."""
    return (t1 - t0) * 1000.0


def segPreProcess_image(image, modelSize=(640, 360), mean=(0.335, 0.358, 0.332),
                        std=(0.141, 0.138, 0.143), numpy=False,
                        RGB_convert_first=False):
    """Resize, normalise and lay out an HxWxC image for the segmentation net.

    Args:
        image: HxWxC array accepted by ``cv2.resize`` / ``cv2.cvtColor``.
        modelSize: target size, passed to ``cv2.resize`` as ``dsize``.
        mean: per-channel means subtracted from channels 0..2 after scaling
            the image to [0, 1].
        std: per-channel divisors applied to channels 0..2.
        numpy: when true, return the CxHxW ``float32`` ndarray directly.
        RGB_convert_first: when true the channel swap (``COLOR_RGB2BGR``) is
            done before normalisation, otherwise after it.

    Returns:
        A CxHxW ndarray if ``numpy`` is true, else a 1xCxHxW float tensor.
    """
    if RGB_convert_first:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image = cv2.resize(image, modelSize)

    image = image.astype(np.float32)
    image /= 255.0
    # Only the first three channels are normalised.
    image[:, :, :3] -= np.asarray(mean[:3], dtype=np.float32)
    image[:, :, :3] /= np.asarray(std[:3], dtype=np.float32)

    if not RGB_convert_first:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image = np.transpose(image, (2, 0, 1))

    if numpy:
        return image
    return torch.from_numpy(image).float().unsqueeze(0)


def yolov5Trtforward(model, im):
    """Run a TensorRT engine on ``im`` and return the output at index 3.

    Binding 0 is treated as the input and all remaining bindings as outputs,
    so the engine needs at least five bindings for ``outputs[3]`` to exist.
    Output tensors use the binding shapes as reported by the engine.
    """
    names = [model.get_binding_name(index) for index in range(model.num_bindings)]
    input_names = names[:1]
    output_names = names[1:]
    with model.create_execution_context() as context:
        bindings = [None] * (len(input_names) + len(output_names))
        outputs = _allocate_outputs(model, output_names, bindings)
        bound_input = _bind_input(model, input_names, bindings, im)
        context.execute_v2(bindings)
        del bound_input  # only needed alive until execution completes
    return outputs[3]


def segTrtForward(engine, inputs, contextFlag=False):
    """Run a segmentation engine with bindings ``'images'`` -> ``'output'``.

    Args:
        engine: TensorRT engine exposing the two named bindings.
        inputs: sequence whose first element is the input tensor.
        contextFlag: an existing execution context to reuse; when falsy a
            new context is created from ``engine``.

    Returns:
        The unwrapped output (see ``_unwrap_outputs``).
    """
    context = contextFlag if contextFlag else engine.create_execution_context()

    input_names = ['images']
    output_names = ['output']
    batch_size = inputs[0].shape[0]
    bindings = [None] * (len(input_names) + len(output_names))

    outputs = _allocate_outputs(engine, output_names, bindings, batch_size)
    # Strictly this should bind inputs[i] per input name; because a single
    # image is used, every input is bound to the same tensor.
    bound_input = _bind_input(engine, input_names, bindings, inputs[0])
    context.execute_v2(bindings)
    del bound_input  # only needed alive until execution completes
    return _unwrap_outputs(outputs)


def OcrTrtForward(engine, inputs, contextFlag=False):
    """Run an OCR engine with bindings ``'images'`` -> ``'output'`` and time it.

    Args:
        engine: TensorRT engine exposing the two named bindings.
        inputs: sequence whose first element is the input tensor.
        contextFlag: an existing execution context to reuse; when falsy a
            new context is created from ``engine``.

    Returns:
        ``(output, outstr)`` where ``output`` is the unwrapped output and
        ``outstr`` reports the per-stage timings in milliseconds.
    """
    t0 = time.time()
    context = contextFlag if contextFlag else engine.create_execution_context()

    input_names = ['images']
    output_names = ['output']
    batch_size = inputs[0].shape[0]
    bindings = [None] * (len(input_names) + len(output_names))
    t1 = time.time()

    outputs = _allocate_outputs(engine, output_names, bindings, batch_size)
    t2 = time.time()

    # Strictly this should bind inputs[i] per input name; because a single
    # image is used, every input is bound to the same tensor.
    bound_input = _bind_input(engine, input_names, bindings, inputs[0])
    t3 = time.time()

    context.execute_v2(bindings)
    t4 = time.time()
    del bound_input  # only needed alive until execution completes

    outstr = 'create Context:%.2f alloc memory:%.2f prepare input:%.2f conext infer:%.2f, total:%.2f' % (
        (t1 - t0) * 1000, (t2 - t1) * 1000, (t3 - t2) * 1000,
        (t4 - t3) * 1000, (t4 - t0) * 1000)
    return _unwrap_outputs(outputs), outstr


def segtrtEval(engine, image_array0, par=None):
    """Segment one image with a TensorRT engine and return the class map.

    Args:
        engine: TensorRT engine passed to ``segTrtForward``.
        image_array0: HxWxC input image.
        par: configuration dict with keys ``modelSize``, ``mean``, ``std``,
            ``numpy`` and ``RGB_convert_first`` (forwarded to
            ``segPreProcess_image``) and optional ``predResize``. When
            ``predResize`` is absent or true the prediction is resized back
            to the input's (W, H). Defaults to ``_DEFAULT_SEG_PAR``.

    Returns:
        ``(pred, segInfoStr)``: the arg-max class map and a string with
        per-stage timings in milliseconds.

    NOTE: with ``par['numpy']`` true the pre-processing returns an ndarray,
    which has no ``.to`` method, so this function requires it to be false.
    """
    if par is None:
        par = _DEFAULT_SEG_PAR

    time0_0 = time.time()
    H, W = image_array0.shape[:2]
    img_input = segPreProcess_image(
        image_array0,
        modelSize=par['modelSize'],
        mean=par['mean'],
        std=par['std'],
        numpy=par['numpy'],
        RGB_convert_first=par['RGB_convert_first'],
    )
    img_input = img_input.to('cuda:0')
    time1_0 = time.time()

    pred = segTrtForward(engine, [img_input])
    time2_0 = time.time()

    pred = torch.argmax(pred, dim=1).cpu().numpy()[0]
    time3_0 = time.time()

    # Resize unless the caller explicitly disabled it.
    if par.get('predResize', True):
        pred = cv2.resize(pred.astype(np.uint8), (W, H))
    time4_0 = time.time()

    segInfoStr = 'pre-precess:%.1f ,infer:%.1f ,post-cpu-argmax:%.1f ,post-resize:%.1f, total:%.1f \n ' % (
        get_ms(time1_0, time0_0), get_ms(time2_0, time1_0),
        get_ms(time3_0, time2_0), get_ms(time4_0, time3_0),
        get_ms(time4_0, time0_0))
    return pred, segInfoStr