371 lines
17 KiB
Python
371 lines
17 KiB
Python
import argparse
|
||
import os
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
import cv2
|
||
import torch
|
||
import torch.backends.cudnn as cudnn
|
||
import torch.nn as nn
|
||
from collections import OrderedDict, namedtuple
|
||
import numpy as np
|
||
import time
|
||
import tensorrt as trt
|
||
#import pycuda.driver as cuda
|
||
def trt_version():
    """Return the version string exposed by the imported TensorRT module."""
    version = trt.__version__
    return version
|
||
|
||
def torch_device_from_trt(device):
    """Translate a TensorRT tensor location into the matching torch device.

    Args:
        device: a ``trt.TensorLocation`` value.

    Returns:
        ``torch.device("cuda")`` for ``TensorLocation.DEVICE`` and
        ``torch.device("cpu")`` for ``TensorLocation.HOST``.

    Raises:
        TypeError: if ``device`` is neither of the two locations above.
    """
    if device == trt.TensorLocation.DEVICE:
        return torch.device("cuda")
    if device == trt.TensorLocation.HOST:
        return torch.device("cpu")
    # The exception has to be raised, not returned: a returned TypeError
    # instance would be handed to the caller as if it were a device.
    raise TypeError("%s is not supported by torch" % device)
|
||
|
||
|
||
def torch_dtype_from_trt(dtype):
    """Translate a TensorRT data type into the matching torch dtype.

    Args:
        dtype: a TensorRT data type (``trt.int8``, ``trt.bool``, ``trt.int32``,
            ``trt.float16`` or ``trt.float32``).

    Returns:
        The corresponding ``torch`` dtype.

    Raises:
        TypeError: if ``dtype`` has no mapping here.
    """
    if dtype == trt.int8:
        return torch.int8

    # ``trt.bool`` is only consulted for TensorRT major version >= 7.
    # The version must be compared numerically: as strings, "10.0" sorts
    # before "7.0" and the bool mapping would be skipped on newer releases.
    major_version = int(trt_version().split('.')[0])
    if major_version >= 7 and dtype == trt.bool:
        return torch.bool

    if dtype == trt.int32:
        return torch.int32
    if dtype == trt.float16:
        return torch.float16
    if dtype == trt.float32:
        return torch.float32
    raise TypeError("%s is not supported by torch" % dtype)
|
||
|
||
class TRTModule(torch.nn.Module):
    """``torch.nn.Module`` wrapper that runs a TensorRT engine on torch tensors."""

    def __init__(self, engine=None, input_names=None, output_names=None):
        """Store the engine and the binding names used by ``forward``.

        Args:
            engine: engine object; ``forward`` calls ``create_execution_context``,
                ``get_binding_index``, ``get_binding_dtype``,
                ``get_binding_shape`` and ``get_location`` on it.
            input_names: names of the input bindings.
            output_names: names of the output bindings.
        """
        super(TRTModule, self).__init__()
        self.engine = engine
        # No execution context is kept on the module; forward() creates a
        # fresh one for every call.
        self.input_names = input_names
        self.output_names = output_names

    def forward(self, *inputs):
        """Run the engine once and return the first output.

        Only ``inputs[0]`` is used: every input binding is pointed at that same
        tensor (the original author's note says this is intentional because a
        single image is fed to all inputs).

        Each output buffer is allocated with shape
        ``(batch_size,) + engine.get_binding_shape(idx)``, where ``batch_size``
        is ``inputs[0].shape[0]``.

        Returns:
            With exactly one output name: the single output tensor indexed with
            ``[0]``, i.e. its first entry along the prepended batch dimension.
            With several output names: the first output tensor.
        """
        # contiguous() may return a new tensor. Keep a named reference so the
        # memory behind data_ptr() stays owned until execute_v2 has finished;
        # taking data_ptr() of an unnamed temporary would leave the binding
        # pointing at a buffer that has already been released.
        input_tensor = inputs[0].contiguous()
        batch_size = input_tensor.shape[0]

        with self.engine.create_execution_context() as context:
            bindings = [None] * (len(self.input_names) + len(self.output_names))

            # Allocate one output tensor per output binding and register its
            # raw pointer in the bindings list.
            outputs = [None] * len(self.output_names)
            for i, output_name in enumerate(self.output_names):
                idx = self.engine.get_binding_index(output_name)  # binding name -> binding index
                dtype = torch_dtype_from_trt(self.engine.get_binding_dtype(idx))
                shape = (batch_size,) + tuple(self.engine.get_binding_shape(idx))
                device = torch_device_from_trt(self.engine.get_location(idx))
                output = torch.empty(size=shape, dtype=dtype, device=device)
                outputs[i] = output
                bindings[idx] = output.data_ptr()

            # Every input binding receives the same (first) input tensor.
            for input_name in self.input_names:
                idx = self.engine.get_binding_index(input_name)
                bindings[idx] = input_tensor.data_ptr()

            context.execute_v2(bindings)  # run inference

        if len(outputs) == 1:
            outputs = outputs[0]

        return outputs[0]
|
||
def get_ms(t1,t0):
    """Return the interval from ``t0`` to ``t1`` (given in seconds) in milliseconds."""
    elapsed_seconds = t1 - t0
    return elapsed_seconds * 1000.0
|
||
def colorstr(*input):
    """Wrap a value in ANSI escape codes, e.g. ``colorstr('blue', 'hello world')``.

    See https://en.wikipedia.org/wiki/ANSI_escape_code

    The last positional argument is the value to colour; all preceding ones are
    style names. With a single argument the styles default to blue + bold.
    The result always ends with the reset code.
    """
    if len(input) > 1:
        *styles, text = input
    else:
        styles, text = ['blue', 'bold'], input[0]

    base_names = ('black', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white')
    codes = {}
    for offset, name in enumerate(base_names):
        codes[name] = f'\033[{30 + offset}m'  # basic colors: 30-37
    for offset, name in enumerate(base_names):
        codes['bright_' + name] = f'\033[{90 + offset}m'  # bright colors: 90-97
    codes['end'] = '\033[0m'  # misc
    codes['bold'] = '\033[1m'
    codes['underline'] = '\033[4m'

    prefix = ''.join(codes[style] for style in styles)
    return prefix + f'{text}' + codes['end']
|
||
def file_size(path):
    """Return the size of a file or directory in MB (bytes / 1E6).

    For a directory, the sizes of all files found recursively are summed.
    A path that is neither a file nor a directory yields 0.0.
    """
    target = Path(path)
    if target.is_file():
        return target.stat().st_size / 1E6
    if not target.is_dir():
        return 0.0

    total_bytes = 0
    for entry in target.glob('**/*'):
        if entry.is_file():
            total_bytes += entry.stat().st_size
    return total_bytes / 1E6
|
||
|
||
def toONNX(seg_model,onnxFile,inputShape=(1,3,360,640),device=torch.device('cuda:0'),dynamic=False ):
    """Export ``seg_model`` to an ONNX file.

    Args:
        seg_model: model to export; it is switched to eval mode and called once
            on a random tensor before the export.
        onnxFile: destination passed to ``torch.onnx.export``.
        inputShape: shape of the random example input.
        device: device the example input is moved to.
        dynamic: when true, ``dynamic_axes`` are passed for axes 0, 2 and 3 of
            both 'images' and 'output'; otherwise the export uses fixed shapes.

    The export uses opset 11, eval mode, constant folding, input name 'images'
    and output name 'output'.
    """
    # Kept from the original: raises ImportError here if onnx is not installed.
    import onnx  # noqa: F401

    im = torch.rand(inputShape).to(device)
    seg_model.eval()
    seg_model(im)  # one forward pass before export, as in the original
    print('###test model infer example over ####')

    train = False
    opset = 11
    # The original reassigned ``dynamic = False`` at this point, which silently
    # discarded the caller's argument; the parameter is now honoured.
    dynamic_axes = None
    if dynamic:
        # NOTE(review): assuming an NCHW layout, axes 2 and 3 would be height
        # and width, so these labels look swapped and 'int_height' looks like
        # a typo. They are kept unchanged because they become axis names in
        # the exported model.
        dynamic_axes = {
            'images': {0: 'batch_size', 2: 'in_width', 3: 'int_height'},
            'output': {0: 'batch_size', 2: 'out_width', 3: 'out_height'},
        }

    print('####begin to export to onnx')
    torch.onnx.export(
        seg_model,
        im,
        onnxFile,
        opset_version=opset,
        training=torch.onnx.TrainingMode.TRAINING if train else torch.onnx.TrainingMode.EVAL,
        do_constant_folding=not train,
        input_names=['images'],
        output_names=['output'],
        dynamic_axes=dynamic_axes,
    )

    print('output onnx file:',onnxFile)
|
||
def ONNXtoTrt(onnxFile,trtFile):
    """Build a TensorRT engine from an ONNX file and write it to ``trtFile``.

    The network is created with the EXPLICIT_BATCH flag, the builder workspace
    is set to 4 GiB, logging is verbose, and FP16 is enabled when
    ``builder.platform_has_fast_fp16`` is true (FP32 otherwise).

    Args:
        onnxFile: path of the ONNX model to parse.
        trtFile: path the serialized engine is written to.

    Raises:
        RuntimeError: if the ONNX file cannot be parsed or the engine build
            yields no engine.
    """
    import tensorrt as trt

    time0 = time.time()
    half = True
    verbose = True
    workspace = 4  # GiB
    prefix = colorstr('TensorRT:')
    f = trtFile

    logger = trt.Logger(trt.Logger.INFO)
    if verbose:
        logger.min_severity = trt.Logger.Severity.VERBOSE

    builder = trt.Builder(logger)
    config = builder.create_builder_config()
    config.max_workspace_size = workspace << 30

    flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    network = builder.create_network(flag)
    parser = trt.OnnxParser(network, logger)
    if not parser.parse_from_file(str(onnxFile)):
        # The original message interpolated the undefined name ``onnx``, which
        # turned a parse failure into a NameError.
        raise RuntimeError(f'failed to load ONNX file: {onnxFile}')

    inputs = [network.get_input(i) for i in range(network.num_inputs)]
    outputs = [network.get_output(i) for i in range(network.num_outputs)]
    print(f'{prefix} Network Description:')
    for inp in inputs:
        print(f'{prefix}\tinput "{inp.name}" with shape {inp.shape} and dtype {inp.dtype}')
    for out in outputs:
        print(f'{prefix}\toutput "{out.name}" with shape {out.shape} and dtype {out.dtype}')

    half &= builder.platform_has_fast_fp16
    print(f'{prefix} building FP{16 if half else 32} engine in {f}')
    if half:
        config.set_flag(trt.BuilderFlag.FP16)

    # Build before opening the output file so that a failed build neither
    # leaves an empty engine file behind nor fails obscurely inside ``with``.
    # NOTE(review): relies on build_engine returning None on failure, per the
    # TensorRT documentation.
    engine = builder.build_engine(network, config)
    if engine is None:
        raise RuntimeError(f'failed to build TensorRT engine from: {onnxFile}')
    with engine, open(f, 'wb') as t:
        t.write(engine.serialize())
    print(f'{prefix} export success, saved as {f} ({file_size(f):.1f} MB)')

    time1 = time.time()
    print('output trtfile from ONNX, time:%.4f s ,'%(time1-time0),trtFile)
|
||
|
||
def segPreProcess_image(image,modelSize=(640,360),mean=(0.335, 0.358, 0.332),std = (0.141, 0.138, 0.143) ,numpy=False, RGB_convert_first=False ):
    """Resize and normalise an image on the CPU, printing per-stage timings.

    Steps, in order: optional RGB2BGR conversion (when ``RGB_convert_first``),
    resize to ``modelSize``, cast to float32 and divide by 255, subtract
    ``mean`` and divide by ``std`` per channel index, RGB2BGR conversion (when
    not ``RGB_convert_first``), and transpose to channel-first.

    When the conversion runs last, ``mean``/``std`` are applied to the channels
    in their pre-conversion order.

    Returns:
        The channel-first float32 array when ``numpy`` is true; otherwise a
        float torch tensor with an extra leading dimension of size 1.
    """
    stamps = [time.time()]

    if RGB_convert_first:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    stamps.append(time.time())

    image = cv2.resize(image, modelSize)
    stamps.append(time.time())

    image = image.astype(np.float32) / 255.0
    stamps.append(time.time())

    for channel in range(3):
        image[:, :, channel] -= mean[channel]
    stamps.append(time.time())

    for channel in range(3):
        image[:, :, channel] /= std[channel]
    if not RGB_convert_first:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image = np.transpose(image, (2, 0, 1))
    stamps.append(time.time())

    # One interval per stage, in milliseconds.
    stage_ms = tuple(get_ms(later, earlier) for earlier, later in zip(stamps, stamps[1:]))
    print('RG convert:%.1f resize:%1f ,normalize:%.1f ,Demean:%.1f ,DeVar:%.1f ' % stage_ms, numpy, RGB_convert_first)

    if numpy:
        return image
    return torch.from_numpy(image).float().unsqueeze(0)
|
||
def segPreProcess_image_torch(image,modelSize=(640,360),mean=(0.335, 0.358, 0.332),std = (0.141, 0.138, 0.143) ,numpy=False, RGB_convert_first=False,device='cuda:0' ):
    """Resize on the CPU, then normalise as a torch tensor on ``device``.

    Input is a numpy image; output is a torch tensor with a leading dimension
    of size 1 and channels first.

    Normalisation is done as ``x / (std * 255) - mean / std`` per channel.
    An RGB2BGR conversion is applied before resizing only when
    ``RGB_convert_first`` is true. The ``numpy`` argument is accepted but not
    used.
    """
    divisor = torch.from_numpy(np.array(std)) * 255.0
    offset = torch.from_numpy(np.array(mean) / np.array(std))

    if RGB_convert_first:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    resized = cv2.resize(image, modelSize)
    channel_first = np.transpose(resized, (2, 0, 1))

    tensor = torch.from_numpy(channel_first).float().to(device)
    tensor = tensor.unsqueeze(0)

    # Scale all three channels first, then shift them, using the same
    # per-channel scalar operations in the same order.
    for channel in range(3):
        tensor[:, channel, :, :] /= divisor[channel]
    for channel in range(3):
        tensor[:, channel, :, :] -= offset[channel]
    return tensor
|
||
|
||
|
||
def yolov5Trtforward(model,im):
    """Run a TensorRT engine on ``im`` and return its fourth output tensor.

    Binding 0 is treated as the input; all remaining bindings are treated as
    outputs. Output buffers are allocated with exactly the shape reported by
    ``get_binding_shape``.

    Args:
        model: engine object providing ``num_bindings``, ``get_binding_name``,
            ``get_binding_index``, ``get_binding_dtype``, ``get_binding_shape``,
            ``get_location`` and ``create_execution_context``.
        im: input tensor.

    Returns:
        ``outputs[3]``, i.e. the tensor of the fourth output binding; an engine
        with fewer than four outputs raises IndexError.
    """
    binding_names = [model.get_binding_name(index) for index in range(model.num_bindings)]
    input_names = [binding_names[0]]
    output_names = binding_names[1:]

    # contiguous() may return a new tensor. Hold a named reference so the
    # buffer behind data_ptr() stays owned until execute_v2 has finished.
    input_tensor = im.contiguous()

    with model.create_execution_context() as context:
        bindings = [None] * (len(input_names) + len(output_names))

        # Allocate one output tensor per output binding and register its
        # raw pointer in the bindings list.
        outputs = [None] * len(output_names)
        for i, output_name in enumerate(output_names):
            idx = model.get_binding_index(output_name)  # binding name -> binding index
            dtype = torch_dtype_from_trt(model.get_binding_dtype(idx))
            shape = tuple(model.get_binding_shape(idx))
            device = torch_device_from_trt(model.get_location(idx))
            output = torch.empty(size=shape, dtype=dtype, device=device)
            outputs[i] = output
            bindings[idx] = output.data_ptr()

        for input_name in input_names:
            idx = model.get_binding_index(input_name)
            bindings[idx] = input_tensor.data_ptr()

        context.execute_v2(bindings)

    return outputs[3]
|
||
|
||
def segTrtForward(engine,inputs,contextFlag=False):
    """Run a TensorRT engine with bindings 'images' -> 'output'.

    Args:
        engine: engine object providing ``get_binding_index``,
            ``get_binding_dtype``, ``get_binding_shape``, ``get_location`` and
            ``create_execution_context``.
        inputs: sequence of tensors; only ``inputs[0]`` is used.
        contextFlag: an existing execution context to reuse. When falsy, a new
            context is created from ``engine`` for this call.

    The output buffer is allocated with shape
    ``(batch_size,) + engine.get_binding_shape(idx)``, where ``batch_size`` is
    ``inputs[0].shape[0]``.

    Returns:
        The single output tensor indexed with ``[0]``, i.e. its first entry
        along the prepended batch dimension.
    """
    context = contextFlag if contextFlag else engine.create_execution_context()

    input_names = ['images']
    output_names = ['output']
    batch_size = inputs[0].shape[0]
    bindings = [None] * (len(input_names) + len(output_names))

    # Allocate one output tensor per output binding and register its raw
    # pointer in the bindings list.
    outputs = [None] * len(output_names)
    for i, output_name in enumerate(output_names):
        idx = engine.get_binding_index(output_name)  # binding name -> binding index
        dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx))
        shape = (batch_size,) + tuple(engine.get_binding_shape(idx))
        device = torch_device_from_trt(engine.get_location(idx))
        output = torch.empty(size=shape, dtype=dtype, device=device)
        outputs[i] = output
        bindings[idx] = output.data_ptr()

    # contiguous() may return a new tensor. Hold a named reference so the
    # buffer behind data_ptr() stays owned until execute_v2 has finished.
    input_tensor = inputs[0].contiguous()
    for input_name in input_names:
        idx = engine.get_binding_index(input_name)
        bindings[idx] = input_tensor.data_ptr()

    context.execute_v2(bindings)  # run inference

    if len(outputs) == 1:
        outputs = outputs[0]

    return outputs[0]
|
||
def OcrTrtForward(engine,inputs,contextFlag=False):
    """Run a TensorRT engine with bindings 'images' -> 'output' and time each stage.

    Args:
        engine: engine object providing ``get_binding_index``,
            ``get_binding_dtype``, ``get_binding_shape``, ``get_location`` and
            ``create_execution_context``.
        inputs: sequence of tensors; only ``inputs[0]`` is used.
        contextFlag: an existing execution context to reuse. When falsy, a new
            context is created from ``engine`` for this call.

    The output buffer is allocated with shape
    ``(batch_size,) + engine.get_binding_shape(idx)``, where ``batch_size`` is
    ``inputs[0].shape[0]``.

    Returns:
        A tuple ``(output, outstr)``: the single output tensor indexed with
        ``[0]`` (its first entry along the prepended batch dimension) and a
        string with the per-stage times in milliseconds.
    """
    t0 = time.time()
    context = contextFlag if contextFlag else engine.create_execution_context()

    input_names = ['images']
    output_names = ['output']
    batch_size = inputs[0].shape[0]
    bindings = [None] * (len(input_names) + len(output_names))
    t1 = time.time()

    # Allocate one output tensor per output binding and register its raw
    # pointer in the bindings list.
    outputs = [None] * len(output_names)
    for i, output_name in enumerate(output_names):
        idx = engine.get_binding_index(output_name)  # binding name -> binding index
        dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx))
        shape = (batch_size,) + tuple(engine.get_binding_shape(idx))
        device = torch_device_from_trt(engine.get_location(idx))
        output = torch.empty(size=shape, dtype=dtype, device=device)
        outputs[i] = output
        bindings[idx] = output.data_ptr()
    t2 = time.time()

    # contiguous() may return a new tensor. Hold a named reference so the
    # buffer behind data_ptr() stays owned until execute_v2 has finished.
    input_tensor = inputs[0].contiguous()
    for input_name in input_names:
        idx = engine.get_binding_index(input_name)
        bindings[idx] = input_tensor.data_ptr()
    t3 = time.time()

    context.execute_v2(bindings)  # run inference
    t4 = time.time()

    if len(outputs) == 1:
        outputs = outputs[0]
    outstr='create Context:%.2f alloc memory:%.2f prepare input:%.2f conext infer:%.2f, total:%.2f'%((t1-t0 )*1000 , (t2-t1)*1000,(t3-t2)*1000,(t4-t3)*1000, (t4-t0)*1000 )
    return outputs[0],outstr
|
||
|
||
def segtrtEval(engine,image_array0,par=None):
    """Segment one image with a TensorRT engine and return the class map.

    Args:
        engine: TensorRT engine handed to ``segTrtForward``.
        image_array0: image array with three dimensions (H, W, C).
        par: optional settings dict. Missing keys fall back to the defaults
            below, so a partial dict is accepted. Keys: 'modelSize', 'nclass',
            'predResize', 'mean', 'std', 'numpy', 'RGB_convert_first'.

    Returns:
        A tuple ``(pred, segInfoStr)``: the per-pixel argmax over dimension 1
        of the engine output (resized to the input's width and height as uint8
        unless 'predResize' is false) and a string with stage timings in ms.
    """
    # Defaults are built per call rather than as a shared mutable default.
    defaults = {
        'modelSize': (640, 360),
        'nclass': 2,
        'predResize': True,
        'mean': (0.485, 0.456, 0.406),
        'std': (0.229, 0.224, 0.225),
        'numpy': False,
        'RGB_convert_first': True,
    }
    par = defaults if par is None else {**defaults, **par}

    time0_0 = time.time()
    H, W, _ = image_array0.shape
    img_input = segPreProcess_image_torch(
        image_array0,
        modelSize=par['modelSize'],
        mean=par['mean'],
        std=par['std'],
        numpy=par['numpy'],
        RGB_convert_first=par['RGB_convert_first'],
    )
    img_input = img_input.to('cuda:0')
    time1_0 = time.time()

    pred = segTrtForward(engine, [img_input])
    time2_0 = time.time()

    pred = torch.argmax(pred, dim=1).cpu().numpy()[0]
    time3_0 = time.time()

    # Resize back to the input size unless 'predResize' is explicitly false;
    # an absent key means "resize", as in the original branching.
    if par.get('predResize', True):
        pred = cv2.resize(pred.astype(np.uint8), (W, H))
    time4_0 = time.time()

    segInfoStr= 'pre-precess:%.1f ,infer:%.1f ,post-cpu-argmax:%.1f ,post-resize:%.1f, total:%.1f '%( get_ms(time1_0,time0_0),get_ms(time2_0,time1_0),get_ms(time3_0,time2_0),get_ms(time4_0,time3_0),get_ms(time4_0,time0_0) )
    return pred,segInfoStr
|
||
|