|
- import argparse
- import os
- import sys
- from pathlib import Path
-
- import cv2
- import torch
- import torch.backends.cudnn as cudnn
- import torch.nn as nn
- from collections import OrderedDict, namedtuple
- import numpy as np
- import time
- import tensorrt as trt
- #import pycuda.driver as cuda
def trt_version():
    """Return the version string exposed by the imported ``tensorrt`` package."""
    version = trt.__version__
    return version
-
def torch_device_from_trt(device):
    """Translate a TensorRT ``TensorLocation`` into the matching ``torch.device``.

    Args:
        device: A ``trt.TensorLocation`` value (``DEVICE`` or ``HOST``).

    Returns:
        ``torch.device("cuda")`` for ``DEVICE``, ``torch.device("cpu")`` for ``HOST``.

    Raises:
        TypeError: If ``device`` is neither of the two known locations.
    """
    if device == trt.TensorLocation.DEVICE:
        return torch.device("cuda")
    if device == trt.TensorLocation.HOST:
        return torch.device("cpu")
    # Raise (not return) the error, matching torch_dtype_from_trt; returning the
    # exception object would hand callers a bogus "device".
    raise TypeError("%s is not supported by torch" % device)
-
-
def torch_dtype_from_trt(dtype):
    """Translate a TensorRT data type into the matching ``torch.dtype``.

    Args:
        dtype: One of ``trt.int8``, ``trt.bool``, ``trt.int32``, ``trt.float16``
            or ``trt.float32``.

    Returns:
        The corresponding ``torch`` dtype.

    Raises:
        TypeError: If ``dtype`` has no mapping here.
    """
    # Compare the major version numerically: a plain string comparison such as
    # '10.0' >= '7.0' is False, which would reject bool on TensorRT 10+.
    major_version = int(trt_version().split('.')[0])

    if dtype == trt.int8:
        return torch.int8
    # ``trt.bool`` is only looked up when the version check passes.
    if major_version >= 7 and dtype == trt.bool:
        return torch.bool
    if dtype == trt.int32:
        return torch.int32
    if dtype == trt.float16:
        return torch.float16
    if dtype == trt.float32:
        return torch.float32
    raise TypeError("%s is not supported by torch" % dtype)
-
class TRTModule(torch.nn.Module):
    """Expose a TensorRT engine through the ``torch.nn.Module`` call interface.

    A new execution context is created from ``engine`` on every ``forward``
    call; the engine and the binding names are simply stored by ``__init__``.
    """

    def __init__(self, engine=None, input_names=None, output_names=None):
        """Store the engine and the names of its input/output bindings.

        Args:
            engine: TensorRT engine used by ``forward``.
            input_names: Sequence of input binding names.
            output_names: Sequence of output binding names.
        """
        super().__init__()
        self.engine = engine
        self.input_names = input_names
        self.output_names = output_names

    def forward(self, *inputs):
        """Run the engine on ``inputs[0]`` and return the first output.

        Only ``inputs[0]`` is used: every input binding is pointed at that same
        tensor (the original author notes this is because a single image is
        fed to all inputs).

        Each output buffer is allocated with shape
        ``(batch_size,) + binding_shape``. With exactly one output the result
        is indexed once more along dim 0, which drops that leading dimension;
        with several outputs the first output tensor is returned as allocated.
        """
        engine = self.engine
        with engine.create_execution_context() as context:
            batch_size = inputs[0].shape[0]
            bindings = [None] * (len(self.input_names) + len(self.output_names))

            # Create the output tensors and allocate their memory.
            outputs = [None] * len(self.output_names)
            for i, output_name in enumerate(self.output_names):
                idx = engine.get_binding_index(output_name)  # binding index for this name
                dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx))
                shape = (batch_size,) + tuple(engine.get_binding_shape(idx))
                device = torch_device_from_trt(engine.get_location(idx))
                output = torch.empty(size=shape, dtype=dtype, device=device)
                outputs[i] = output
                bindings[idx] = output.data_ptr()  # bind the output buffer address

            # Hold the contiguous tensor in a local so its storage stays alive
            # until inference finishes; taking data_ptr() of a temporary would
            # leave the binding pointing at memory that may already be released.
            input_tensor = inputs[0].contiguous()
            for input_name in self.input_names:
                idx = engine.get_binding_index(input_name)
                bindings[idx] = input_tensor.data_ptr()

            context.execute_v2(bindings)  # run inference

        if len(outputs) == 1:
            outputs = outputs[0]

        return outputs[0]
def get_ms(t1, t0):
    """Return ``t1 - t0`` scaled by 1000.0 (seconds to milliseconds for ``time.time()`` stamps)."""
    elapsed = t1 - t0
    return elapsed * 1000.0
def colorstr(*input):
    """Wrap a string in ANSI escape codes, e.g. ``colorstr('blue', 'hello world')``.

    All positional arguments except the last are colour/style names; the last
    one is the text. With a single argument the text is styled blue and bold.
    See https://en.wikipedia.org/wiki/ANSI_escape_code
    """
    if len(input) > 1:
        *args, string = input
    else:
        args, string = ['blue', 'bold'], input[0]

    # Basic colours use codes 30-37, their bright variants 90-97, in this order.
    base_names = ('black', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white')
    colors = {}
    for offset, name in enumerate(base_names):
        colors[name] = f'\033[{30 + offset}m'
    for offset, name in enumerate(base_names):
        colors[f'bright_{name}'] = f'\033[{90 + offset}m'
    # Miscellaneous codes.
    colors['end'] = '\033[0m'
    colors['bold'] = '\033[1m'
    colors['underline'] = '\033[4m'

    prefix = ''.join(colors[x] for x in args)
    return prefix + f'{string}' + colors['end']
def file_size(path):
    """Return the size of a file, or the summed size of all files under a directory, in MB.

    Sizes are byte counts divided by 1E6. Anything that is neither a file nor
    a directory yields ``0.0``.
    """
    target = Path(path)
    if target.is_file():
        return target.stat().st_size / 1E6
    if not target.is_dir():
        return 0.0

    total_bytes = 0
    for entry in target.glob('**/*'):
        if entry.is_file():
            total_bytes += entry.stat().st_size
    return total_bytes / 1E6
-
def toONNX(seg_model, onnxFile, inputShape=(1, 3, 360, 640), device=torch.device('cuda:0'), dynamic=False):
    """Export ``seg_model`` to ONNX (opset 11, eval mode, constant folding on).

    A random tensor of ``inputShape`` is pushed through the model once as a
    smoke test and then used as the tracing example for the export. The graph
    input is named ``images`` and the output ``output``.

    Args:
        seg_model: The ``torch.nn.Module`` to export; it is switched to eval mode.
        onnxFile: Destination path of the ONNX file.
        inputShape: Shape of the example input.
        device: Device the example input is created on.
        dynamic: When true, mark axes 0, 2 and 3 of both input and output as
            dynamic. Previously this argument was overwritten with ``False``
            inside the function and therefore had no effect.
    """
    im = torch.rand(inputShape).to(device)
    seg_model.eval()
    seg_model(im)  # fail early if the model cannot process the example input
    print('###test model infer example over ####')
    print('####begin to export to onnx')

    dynamic_axes = None
    if dynamic:
        # NOTE(review): labels assume NCHW layout (axis 2 = height, axis 3 = width)
        # and a 4-D output, as suggested by the default inputShape - confirm.
        dynamic_axes = {
            'images': {0: 'batch_size', 2: 'in_height', 3: 'in_width'},
            'output': {0: 'batch_size', 2: 'out_height', 3: 'out_width'},
        }

    torch.onnx.export(
        seg_model,
        im,
        onnxFile,
        opset_version=11,
        training=torch.onnx.TrainingMode.EVAL,
        do_constant_folding=True,
        input_names=['images'],
        output_names=['output'],
        dynamic_axes=dynamic_axes,
    )

    print('output onnx file:', onnxFile)
def ONNXtoTrt(onnxFile, trtFile, half=True, verbose=True, workspace=4):
    """Build a TensorRT engine from an ONNX file and write it to ``trtFile``.

    Args:
        onnxFile: Path of the ONNX model to parse.
        trtFile: Destination path of the serialized engine.
        half: Request an FP16 engine; only honoured when the builder reports
            ``platform_has_fast_fp16``.
        verbose: Lower the TensorRT logger's minimum severity to VERBOSE.
        workspace: Builder workspace limit, in GiB.

    Raises:
        RuntimeError: If the ONNX file cannot be parsed or no engine is produced.
    """
    import tensorrt as trt

    time0 = time.time()
    prefix = colorstr('TensorRT:')
    f = trtFile  # TensorRT engine file
    logger = trt.Logger(trt.Logger.INFO)
    if verbose:
        logger.min_severity = trt.Logger.Severity.VERBOSE

    builder = trt.Builder(logger)
    config = builder.create_builder_config()
    config.max_workspace_size = workspace * (1 << 30)

    flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(flag)
    parser = trt.OnnxParser(network, logger)
    if not parser.parse_from_file(str(onnxFile)):
        # The message previously formatted an undefined name, which turned a
        # parse failure into a NameError.
        raise RuntimeError(f'failed to load ONNX file: {onnxFile}')

    inputs = [network.get_input(i) for i in range(network.num_inputs)]
    outputs = [network.get_output(i) for i in range(network.num_outputs)]
    print(f'{prefix} Network Description:')
    for inp in inputs:
        print(f'{prefix}\tinput "{inp.name}" with shape {inp.shape} and dtype {inp.dtype}')
    for out in outputs:
        print(f'{prefix}\toutput "{out.name}" with shape {out.shape} and dtype {out.dtype}')

    half &= builder.platform_has_fast_fp16
    print(f'{prefix} building FP{16 if half else 32} engine in {f}')
    if half:
        config.set_flag(trt.BuilderFlag.FP16)

    engine = builder.build_engine(network, config)
    if engine is None:
        # NOTE(review): assumes build_engine signals failure by returning None
        # (see the TensorRT Builder docs) - confirm for the installed version.
        raise RuntimeError(f'failed to build TensorRT engine from: {onnxFile}')
    with engine, open(f, 'wb') as t:
        t.write(engine.serialize())
    print(f'{prefix} export success, saved as {f} ({file_size(f):.1f} MB)')
    time1 = time.time()
    print('output trtfile from ONNX, time:%.4f s ,' % (time1 - time0), trtFile)
-
def segPreProcess_image(image, modelSize=(640, 360), mean=(0.335, 0.358, 0.332), std=(0.141, 0.138, 0.143), numpy=False, RGB_convert_first=False):
    """Resize and normalise an image on the CPU and return it channel-first.

    Steps: optional ``COLOR_RGB2BGR`` swap (before everything when
    ``RGB_convert_first`` is true, otherwise after normalisation), resize to
    ``modelSize``, scale by 1/255, subtract ``mean`` and divide by ``std`` per
    channel index, then transpose to (C, H, W). Stage timings are printed.

    Returns:
        The (C, H, W) ``float32`` array when ``numpy`` is true, otherwise a
        float tensor with a leading dimension of size 1 added.
    """
    stamps = [time.time()]
    if RGB_convert_first:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    stamps.append(time.time())

    image = cv2.resize(image, modelSize)
    stamps.append(time.time())

    image = image.astype(np.float32)
    image /= 255.0
    stamps.append(time.time())

    for channel in range(3):
        image[:, :, channel] -= mean[channel]
    stamps.append(time.time())

    for channel in range(3):
        image[:, :, channel] /= std[channel]
    if not RGB_convert_first:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image = np.transpose(image, (2, 0, 1))
    stamps.append(time.time())

    time0, time1, time2, time3, time4, time5 = stamps
    print('RG convert:%.1f resize:%1f ,normalize:%.1f ,Demean:%.1f ,DeVar:%.1f '%( get_ms(time1,time0 ), get_ms(time2,time1 ), get_ms(time3,time2 ), get_ms(time4,time3 ), get_ms(time5,time4 ) ), numpy, RGB_convert_first)

    if numpy:
        return image
    tensor = torch.from_numpy(image).float()
    return tensor.unsqueeze(0)
def segPreProcess_image_torch(image, modelSize=(640, 360), mean=(0.335, 0.358, 0.332), std=(0.141, 0.138, 0.143), numpy=False, RGB_convert_first=False, device='cuda:0'):
    """Resize an image on the CPU, then normalise it as a tensor on ``device``.

    The input is a numpy image and the output is a torch tensor. Normalisation
    is done as ``x / (std * 255) - mean / std`` per channel index, which equals
    ``(x / 255 - mean) / std``.

    Args:
        image: Image array accepted by ``cv2.resize``.
            NOTE(review): presumably H x W x 3 - confirm against callers.
        modelSize: Target size handed to ``cv2.resize``.
        mean: Per-channel mean (three values).
        std: Per-channel standard deviation (three values).
        numpy: Unused; kept for signature compatibility with
            ``segPreProcess_image``.
        RGB_convert_first: Apply ``cv2.COLOR_RGB2BGR`` before resizing.
        device: Device the tensor is moved to before normalisation.

    Returns:
        A contiguous float tensor with a leading dimension of size 1.
    """
    scale = torch.from_numpy(np.array(std)) * 255.0
    shift = torch.from_numpy(np.array(mean) / np.array(std))

    if RGB_convert_first:
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image = cv2.resize(image, modelSize)
    image = np.transpose(image, (2, 0, 1))

    # np.transpose yields a strided view; make the tensor contiguous once here
    # so code that binds its data pointer does not need a temporary copy.
    image = torch.from_numpy(image).float().to(device).contiguous()
    image = image.unsqueeze(0)

    for channel in range(3):
        image[:, channel, :, :] /= scale[channel]
    for channel in range(3):
        image[:, channel, :, :] -= shift[channel]
    return image
-
-
def yolov5Trtforward(model, im):
    """Run a TensorRT engine on ``im`` and return the output at list index 3.

    The first binding name reported by the engine is treated as the input and
    all remaining bindings as outputs. Output buffers are allocated with the
    shape, dtype and location reported by the engine.

    Args:
        model: TensorRT engine.
        im: Input tensor; its contiguous data pointer is bound to the input.

    Returns:
        ``outputs[3]``, i.e. the tensor of the fourth output binding.
        NOTE(review): this hard-coded index requires at least four output
        bindings - confirm against the exported engine.
    """
    names = [model.get_binding_name(index) for index in range(model.num_bindings)]
    input_names = [names[0]]
    output_names = names[1:]

    with model.create_execution_context() as context:
        bindings = [None] * (len(input_names) + len(output_names))

        # Create the output tensors and allocate their memory.
        outputs = [None] * len(output_names)
        for i, output_name in enumerate(output_names):
            idx = model.get_binding_index(output_name)  # binding index for this name
            dtype = torch_dtype_from_trt(model.get_binding_dtype(idx))
            shape = tuple(model.get_binding_shape(idx))
            device = torch_device_from_trt(model.get_location(idx))
            output = torch.empty(size=shape, dtype=dtype, device=device)
            outputs[i] = output
            bindings[idx] = output.data_ptr()  # bind the output buffer address

        # Keep the contiguous tensor referenced until inference is done so the
        # bound address cannot point at an already released temporary.
        input_tensor = im.contiguous()
        for input_name in input_names:
            idx = model.get_binding_index(input_name)
            bindings[idx] = input_tensor.data_ptr()
        context.execute_v2(bindings)

    return outputs[3]
-
def segTrtForward(engine, inputs, contextFlag=False):
    """Run a TensorRT engine with bindings ``images`` -> ``output``.

    Args:
        engine: TensorRT engine.
        inputs: Sequence of tensors; only ``inputs[0]`` is used.
        contextFlag: An existing execution context to reuse. When falsy, a new
            context is created from ``engine`` for this call.

    Returns:
        The output buffer indexed once along dim 0. The buffer is allocated as
        ``(batch_size,) + binding_shape``, so this index drops that leading
        dimension.
    """
    context = contextFlag if contextFlag else engine.create_execution_context()

    input_names = ['images']
    output_names = ['output']
    batch_size = inputs[0].shape[0]
    bindings = [None] * (len(input_names) + len(output_names))

    # Create the output tensors and allocate their memory.
    outputs = [None] * len(output_names)
    for i, output_name in enumerate(output_names):
        idx = engine.get_binding_index(output_name)  # binding index for this name
        dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx))
        shape = (batch_size,) + tuple(engine.get_binding_shape(idx))
        device = torch_device_from_trt(engine.get_location(idx))
        output = torch.empty(size=shape, dtype=dtype, device=device)
        outputs[i] = output
        bindings[idx] = output.data_ptr()  # bind the output buffer address

    # Keep the contiguous tensor referenced until inference is done so the
    # bound address cannot point at an already released temporary.
    input_tensor = inputs[0].contiguous()
    for input_name in input_names:
        idx = engine.get_binding_index(input_name)
        bindings[idx] = input_tensor.data_ptr()
    context.execute_v2(bindings)  # run inference

    if len(outputs) == 1:
        outputs = outputs[0]

    return outputs[0]
def OcrTrtForward(engine, inputs, contextFlag=False):
    """Run a TensorRT engine with bindings ``images`` -> ``output`` and time each stage.

    Args:
        engine: TensorRT engine.
        inputs: Sequence of tensors; only ``inputs[0]`` is used.
        contextFlag: An existing execution context to reuse. When falsy, a new
            context is created from ``engine`` for this call.

    Returns:
        A tuple ``(output, outstr)``. ``output`` is the output buffer indexed
        once along dim 0 (the buffer is allocated as
        ``(batch_size,) + binding_shape``). ``outstr`` is a string with the
        per-stage timings in milliseconds.
    """
    t0 = time.time()
    context = contextFlag if contextFlag else engine.create_execution_context()

    input_names = ['images']
    output_names = ['output']
    batch_size = inputs[0].shape[0]
    bindings = [None] * (len(input_names) + len(output_names))
    t1 = time.time()

    # Create the output tensors and allocate their memory.
    outputs = [None] * len(output_names)
    for i, output_name in enumerate(output_names):
        idx = engine.get_binding_index(output_name)  # binding index for this name
        dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx))
        shape = (batch_size,) + tuple(engine.get_binding_shape(idx))
        device = torch_device_from_trt(engine.get_location(idx))
        output = torch.empty(size=shape, dtype=dtype, device=device)
        outputs[i] = output
        bindings[idx] = output.data_ptr()  # bind the output buffer address
    t2 = time.time()

    # Keep the contiguous tensor referenced until inference is done so the
    # bound address cannot point at an already released temporary.
    input_tensor = inputs[0].contiguous()
    for input_name in input_names:
        idx = engine.get_binding_index(input_name)
        bindings[idx] = input_tensor.data_ptr()
    t3 = time.time()
    context.execute_v2(bindings)  # run inference
    t4 = time.time()

    if len(outputs) == 1:
        outputs = outputs[0]
    outstr = 'create Context:%.2f alloc memory:%.2f prepare input:%.2f conext infer:%.2f, total:%.2f' % (
        (t1 - t0) * 1000, (t2 - t1) * 1000, (t3 - t2) * 1000, (t4 - t3) * 1000, (t4 - t0) * 1000)
    return outputs[0], outstr
-
def segtrtEval(engine, image_array0, par={'modelSize':(640,360),'nclass':2,'predResize':True,'mean':(0.485, 0.456, 0.406),'std' :(0.229, 0.224, 0.225),'numpy':False, 'RGB_convert_first':True}):
    """Segment one image with a TensorRT engine and return the class map plus timings.

    The image is preprocessed with ``segPreProcess_image_torch``, run through
    ``segTrtForward``, reduced with ``argmax`` over dim 1 and, depending on
    ``par['predResize']``, resized back to the input's width and height.

    Args:
        engine: TensorRT engine passed to ``segTrtForward``.
        image_array0: Input image array with a three-element ``shape`` (H, W, C).
        par: Settings dict; reads ``modelSize``, ``mean``, ``std``, ``numpy``,
            ``RGB_convert_first`` and optionally ``predResize``.

    Returns:
        A tuple ``(pred, segInfoStr)``: the prediction array and a string with
        the stage timings in milliseconds.
    """
    time0_0 = time.time()
    H, W, _ = image_array0.shape

    img_input = segPreProcess_image_torch(
        image_array0,
        modelSize=par['modelSize'],
        mean=par['mean'],
        std=par['std'],
        numpy=par['numpy'],
        RGB_convert_first=par['RGB_convert_first'],
    )
    img_input = img_input.to('cuda:0')
    time1_0 = time.time()

    pred = segTrtForward(engine, [img_input])
    time2_0 = time.time()

    pred = torch.argmax(pred, dim=1).cpu().numpy()[0]
    time3_0 = time.time()

    # NOTE(review): indentation was lost in SOURCE; this assumes the `else`
    # pairs with the key-presence check, i.e. resize when 'predResize' is
    # absent or truthy and skip it when present and falsy - confirm.
    if par.get('predResize', True):
        pred = cv2.resize(pred.astype(np.uint8), (W, H))
    time4_0 = time.time()

    stage_ms = (
        get_ms(time1_0, time0_0),
        get_ms(time2_0, time1_0),
        get_ms(time3_0, time2_0),
        get_ms(time4_0, time3_0),
        get_ms(time4_0, time0_0),
    )
    segInfoStr = 'pre-precess:%.1f ,infer:%.1f ,post-cpu-argmax:%.1f ,post-resize:%.1f, total:%.1f ' % stage_ms
    return pred, segInfoStr
-
|