|
- import argparse
- import os
- import sys
- from pathlib import Path
-
- import cv2
- import torch
- import torch.backends.cudnn as cudnn
- import torch.nn as nn
- from collections import OrderedDict, namedtuple
- import numpy as np
- import time
- import tensorrt as trt
- #import pycuda.driver as cuda
def trt_version():
    """Return the version string reported by the imported ``tensorrt`` package."""
    version = trt.__version__
    return version
-
def torch_device_from_trt(device):
    """Map a TensorRT tensor location onto the matching ``torch.device``.

    Args:
        device: A ``trt.TensorLocation`` value, e.g. the result of
            ``engine.get_location(idx)``.

    Returns:
        ``torch.device("cuda")`` for ``TensorLocation.DEVICE`` and
        ``torch.device("cpu")`` for ``TensorLocation.HOST``.

    Raises:
        TypeError: If ``device`` is neither of the two known locations.
    """
    if device == trt.TensorLocation.DEVICE:
        return torch.device("cuda")
    if device == trt.TensorLocation.HOST:
        return torch.device("cpu")
    # The exception used to be *returned*; raising makes an unknown location
    # fail here instead of being passed on as if it were a device.
    raise TypeError("%s is not supported by torch" % device)
-
-
def torch_dtype_from_trt(dtype):
    """Translate a TensorRT data type into the equivalent ``torch`` dtype.

    Args:
        dtype: A TensorRT dtype such as ``trt.int8``, ``trt.int32``,
            ``trt.float16``, ``trt.float32`` or (TensorRT 7+) ``trt.bool``.

    Returns:
        The corresponding ``torch`` dtype.

    Raises:
        TypeError: If ``dtype`` has no mapping below.
    """
    if dtype == trt.int8:
        return torch.int8

    # Compare the major version as a number. As strings, '10.0' >= '7.0' is
    # False, which made trt.bool look unsupported on TensorRT 10 and later.
    major_version = int(trt_version().split(".")[0])
    # ``trt.bool`` is only touched when the version check passes.
    if major_version >= 7 and dtype == trt.bool:
        return torch.bool

    if dtype == trt.int32:
        return torch.int32
    if dtype == trt.float16:
        return torch.float16
    if dtype == trt.float32:
        return torch.float32
    raise TypeError("%s is not supported by torch" % dtype)
-
class TRTModule(torch.nn.Module):
    """``torch.nn.Module`` wrapper that runs a TensorRT engine.

    Args:
        engine: TensorRT engine used for inference.
        input_names: Names of the engine's input bindings.
        output_names: Names of the engine's output bindings.
    """

    def __init__(self, engine=None, input_names=None, output_names=None):
        super(TRTModule, self).__init__()
        self.engine = engine
        # No execution context is stored here; ``forward`` creates one per call.
        self.input_names = input_names
        self.output_names = output_names

    def forward(self, *inputs):
        """Run the engine on ``inputs[0]`` and return the first result.

        Only ``inputs[0]`` is used: every input binding is pointed at that
        same tensor (the original author notes this is because a single
        image is fed to all inputs).

        Each output buffer is allocated with shape
        ``(batch_size,) + binding_shape`` where ``batch_size`` is
        ``inputs[0].shape[0]``.

        Returns:
            With several outputs, the first output tensor. With exactly one
            output, that tensor indexed by ``[0]`` (its leading dimension is
            dropped).
        """
        with self.engine.create_execution_context() as context:
            batch_size = inputs[0].shape[0]
            bindings = [None] * (len(self.input_names) + len(self.output_names))

            # Allocate one torch tensor per output binding and register its
            # address with the engine.
            outputs = [None] * len(self.output_names)
            for i, output_name in enumerate(self.output_names):
                idx = self.engine.get_binding_index(output_name)
                dtype = torch_dtype_from_trt(self.engine.get_binding_dtype(idx))
                shape = (batch_size,) + tuple(self.engine.get_binding_shape(idx))
                device = torch_device_from_trt(self.engine.get_location(idx))
                output = torch.empty(size=shape, dtype=dtype, device=device)
                outputs[i] = output
                print('###line65:', output_name, i, idx, dtype, shape)
                bindings[idx] = output.data_ptr()

            # ``contiguous()`` may return a new tensor. Keep a reference to it
            # until inference has finished; taking ``data_ptr()`` from an
            # unnamed temporary would leave the binding pointing at memory
            # that can already have been released.
            input_tensor = inputs[0].contiguous()
            for input_name in self.input_names:
                idx = self.engine.get_binding_index(input_name)
                bindings[idx] = input_tensor.data_ptr()

            context.execute_v2(bindings)

        if len(outputs) == 1:
            outputs = outputs[0]

        return outputs[0]
def get_ms(t1, t0):
    """Return the difference ``t1 - t0`` scaled by 1000 (seconds to milliseconds)."""
    elapsed = t1 - t0
    return elapsed * 1000.0
-
-
def segPreProcess_image(image, modelSize=(640, 360), mean=(0.335, 0.358, 0.332), std=(0.141, 0.138, 0.143), numpy=False, RGB_convert_first=False):
    """Resize and normalise an image for the segmentation network.

    Steps: optional ``cv2.COLOR_RGB2BGR`` conversion, resize to
    ``modelSize``, cast to float32 and divide by 255, subtract ``mean`` and
    divide by ``std`` per channel, ``cv2.COLOR_RGB2BGR`` conversion if it was
    not done first, then move the channel axis to the front.

    Args:
        image: Image array with three channels on the last axis
            (assumes H x W x 3 as produced by OpenCV -- TODO confirm).
        modelSize: Target size passed to ``cv2.resize``.
        mean: Per-channel values subtracted after scaling by 1/255.
        std: Per-channel divisors applied after the mean subtraction.
        numpy: If true, return the NumPy array instead of a tensor.
        RGB_convert_first: If true, convert the channel order before
            resizing and normalising; otherwise convert afterwards, so
            ``mean``/``std`` are applied in the input's channel order.

    Returns:
        A channel-first float32 NumPy array when ``numpy`` is true, otherwise
        a float ``torch`` tensor with an extra leading dimension.
    """
    if RGB_convert_first:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image = cv2.resize(image, modelSize)

    image = image.astype(np.float32)
    image /= 255.0

    # Channels are independent, so subtracting and dividing per channel in
    # one pass gives the same values as the former six separate statements.
    for channel in range(3):
        image[:, :, channel] -= mean[channel]
        image[:, :, channel] /= std[channel]

    if not RGB_convert_first:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image = np.transpose(image, (2, 0, 1))

    if numpy:
        return image
    return torch.from_numpy(image).float().unsqueeze(0)
-
def yolov5Trtforward(model, im):
    """Run a TensorRT engine on ``im`` and return its fourth output tensor.

    The first binding of the engine is treated as the input and every
    remaining binding as an output. The function returns ``outputs[3]``, so
    the engine must expose at least four outputs.

    Args:
        model: TensorRT engine (provides ``num_bindings``,
            ``get_binding_name`` and ``create_execution_context``).
        im: Input tensor. NOTE(review): presumably already on the device the
            engine expects -- confirm against the caller.

    Returns:
        The tensor allocated for the output binding at position 3.
    """
    binding_names = [model.get_binding_name(index) for index in range(model.num_bindings)]
    input_names = [binding_names[0]]
    output_names = binding_names[1:]

    with model.create_execution_context() as context:
        bindings = [None] * (len(input_names) + len(output_names))

        # Allocate one torch tensor per output binding and register its
        # address with the engine.
        outputs = [None] * len(output_names)
        for i, output_name in enumerate(output_names):
            idx = model.get_binding_index(output_name)
            dtype = torch_dtype_from_trt(model.get_binding_dtype(idx))
            shape = tuple(model.get_binding_shape(idx))
            device = torch_device_from_trt(model.get_location(idx))
            output = torch.empty(size=shape, dtype=dtype, device=device)
            outputs[i] = output
            bindings[idx] = output.data_ptr()

        # Hold the contiguous tensor in a local so its storage stays alive
        # until ``execute_v2`` returns; ``contiguous()`` may return a copy.
        input_tensor = im.contiguous()
        for input_name in input_names:
            idx = model.get_binding_index(input_name)
            bindings[idx] = input_tensor.data_ptr()
        context.execute_v2(bindings)

    return outputs[3]
-
def segTrtForward(engine, inputs, contextFlag=False):
    """Run the segmentation engine on ``inputs[0]``.

    The engine is expected to have an input binding named ``'images'`` and
    an output binding named ``'output'``.

    Args:
        engine: TensorRT engine.
        inputs: Sequence whose first element is the input tensor; only
            ``inputs[0]`` is used.
        contextFlag: Either a falsy value, in which case a new execution
            context is created from ``engine``, or an existing execution
            context to reuse.

    Returns:
        The output tensor indexed by ``[0]``. The buffer is allocated as
        ``(batch_size,) + binding_shape`` and there is exactly one output, so
        the leading dimension is dropped.
    """
    context = contextFlag if contextFlag else engine.create_execution_context()

    input_names = ['images']
    output_names = ['output']
    batch_size = inputs[0].shape[0]
    bindings = [None] * (len(input_names) + len(output_names))

    # Allocate one torch tensor per output binding and register its address
    # with the engine.
    outputs = [None] * len(output_names)
    for i, output_name in enumerate(output_names):
        idx = engine.get_binding_index(output_name)
        dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx))
        shape = (batch_size,) + tuple(engine.get_binding_shape(idx))
        device = torch_device_from_trt(engine.get_location(idx))
        output = torch.empty(size=shape, dtype=dtype, device=device)
        outputs[i] = output
        bindings[idx] = output.data_ptr()

    # Hold the contiguous tensor in a local so its storage stays alive until
    # ``execute_v2`` returns; ``contiguous()`` may return a copy, and a
    # pointer taken from an unnamed temporary could dangle.
    input_tensor = inputs[0].contiguous()
    for input_name in input_names:
        idx = engine.get_binding_index(input_name)
        bindings[idx] = input_tensor.data_ptr()
    context.execute_v2(bindings)

    if len(outputs) == 1:
        outputs = outputs[0]

    return outputs[0]
def OcrTrtForward(engine, inputs, contextFlag=False):
    """Run the OCR engine on ``inputs[0]`` and report per-stage timings.

    The engine is expected to have an input binding named ``'images'`` and
    an output binding named ``'output'``.

    Args:
        engine: TensorRT engine.
        inputs: Sequence whose first element is the input tensor; only
            ``inputs[0]`` is used.
        contextFlag: Either a falsy value, in which case a new execution
            context is created from ``engine``, or an existing execution
            context to reuse.

    Returns:
        A tuple ``(output, outstr)``. ``output`` is the output tensor indexed
        by ``[0]`` (allocated as ``(batch_size,) + binding_shape``).
        ``outstr`` is a string with the milliseconds spent on context setup,
        output allocation, input preparation, inference and in total.
    """
    t0 = time.time()
    context = contextFlag if contextFlag else engine.create_execution_context()

    input_names = ['images']
    output_names = ['output']
    batch_size = inputs[0].shape[0]
    bindings = [None] * (len(input_names) + len(output_names))
    t1 = time.time()

    # Allocate one torch tensor per output binding and register its address
    # with the engine.
    outputs = [None] * len(output_names)
    for i, output_name in enumerate(output_names):
        idx = engine.get_binding_index(output_name)
        dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx))
        shape = (batch_size,) + tuple(engine.get_binding_shape(idx))
        device = torch_device_from_trt(engine.get_location(idx))
        output = torch.empty(size=shape, dtype=dtype, device=device)
        outputs[i] = output
        bindings[idx] = output.data_ptr()
    t2 = time.time()

    # Hold the contiguous tensor in a local so its storage stays alive until
    # ``execute_v2`` returns; ``contiguous()`` may return a copy, and a
    # pointer taken from an unnamed temporary could dangle.
    input_tensor = inputs[0].contiguous()
    for input_name in input_names:
        idx = engine.get_binding_index(input_name)
        bindings[idx] = input_tensor.data_ptr()
    t3 = time.time()
    context.execute_v2(bindings)
    t4 = time.time()

    if len(outputs) == 1:
        outputs = outputs[0]
    outstr = 'create Context:%.2f alloc memory:%.2f prepare input:%.2f conext infer:%.2f, total:%.2f' % (
        (t1 - t0) * 1000,
        (t2 - t1) * 1000,
        (t3 - t2) * 1000,
        (t4 - t3) * 1000,
        (t4 - t0) * 1000,
    )
    return outputs[0], outstr
-
def segtrtEval(engine, image_array0, par=None):
    """Segment one image with a TensorRT engine and time each stage.

    Args:
        engine: TensorRT engine passed on to ``segTrtForward``.
        image_array0: Input image; its shape must unpack into three values
            (height, width, channels).
        par: Options dict. Keys read here: ``'modelSize'``, ``'mean'``,
            ``'std'``, ``'numpy'``, ``'RGB_convert_first'`` (all forwarded to
            ``segPreProcess_image``) and the optional ``'predResize'``.
            When omitted, the defaults listed in the function body are used.

    Returns:
        A tuple ``(pred, segInfoStr)``. ``pred`` is the per-pixel argmax over
        dimension 1 of the network output, resized to the input's width and
        height unless ``par['predResize']`` is present and falsy.
        ``segInfoStr`` lists the milliseconds spent per stage.
    """
    # Build the defaults per call instead of sharing one mutable dict object
    # as the default argument.
    if par is None:
        par = {
            'modelSize': (640, 360),
            'nclass': 2,
            'predResize': True,
            'mean': (0.485, 0.456, 0.406),
            'std': (0.229, 0.224, 0.225),
            'numpy': False,
            'RGB_convert_first': True,
        }

    time0_0 = time.time()
    # The channel count is not needed, but the three-way unpacking still
    # rejects inputs that are not three-dimensional.
    H, W, _ = image_array0.shape
    img_input = segPreProcess_image(
        image_array0,
        modelSize=par['modelSize'],
        mean=par['mean'],
        std=par['std'],
        numpy=par['numpy'],
        RGB_convert_first=par['RGB_convert_first'],
    )
    img_input = img_input.to('cuda:0')
    time1_0 = time.time()

    pred = segTrtForward(engine, [img_input])
    time2_0 = time.time()

    pred = torch.argmax(pred, dim=1).cpu().numpy()[0]
    time3_0 = time.time()

    # Resize back to the input resolution unless the caller switched it off;
    # a missing 'predResize' key means "resize".
    if par.get('predResize', True):
        pred = cv2.resize(pred.astype(np.uint8), (W, H))
    time4_0 = time.time()

    segInfoStr = 'pre-precess:%.1f ,infer:%.1f ,post-cpu-argmax:%.1f ,post-resize:%.1f, total:%.1f \n ' % (
        get_ms(time1_0, time0_0),
        get_ms(time2_0, time1_0),
        get_ms(time3_0, time2_0),
        get_ms(time4_0, time3_0),
        get_ms(time4_0, time0_0),
    )
    return pred, segInfoStr
-
-
|