# AIlib2/segutils/trtUtils2.py
#
# Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
#
# NOTICE TO LICENSEE:
#
# This source code and/or documentation ("Licensed Deliverables") are
# subject to NVIDIA intellectual property rights under U.S. and
# international Copyright laws.
#
# These Licensed Deliverables contained herein is PROPRIETARY and
# CONFIDENTIAL to NVIDIA and is being provided under the terms and
# conditions of a form of NVIDIA software license agreement by and
# between NVIDIA and Licensee ("License Agreement") or electronically
# accepted by Licensee. Notwithstanding any terms or conditions to
# the contrary in the License Agreement, reproduction or disclosure
# of the Licensed Deliverables to any third party without the express
# written consent of NVIDIA is prohibited.
#
# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
# LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
# SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
# PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
# NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
# DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
# NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
# LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
# SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THESE LICENSED DELIVERABLES.
#
# U.S. Government End Users. These Licensed Deliverables are a
# "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
# 1995), consisting of "commercial computer software" and "commercial
# computer software documentation" as such terms are used in 48
# C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
# only as a commercial end item. Consistent with 48 C.F.R.12.212 and
# 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
# U.S. Government End Users acquire the Licensed Deliverables with
# only those rights set forth herein.
#
# Any use of the Licensed Deliverables in individual and commercial
# software must include, in the user documentation and internal
# comments to the code, the above Disclaimer and U.S. Government End
# Users Notice.
#
import argparse
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import torch
import tensorrt as trt
import time
import onnx
import onnxruntime
import os,sys,cv2
#from model.u2net import U2NET
#cuda.init()
model_names = ['u2net.onnx', 'u2net_dynamic_batch.onnx', 'u2net_dynamic_hw.onnx', 'u2net_dynamic_batch-hw.onnx']
dynamic_batch = {'input':   {0: 'batch'},
                 'output0': {0: 'batch'},
                 'output1': {0: 'batch'},
                 'output2': {0: 'batch'},
                 'output3': {0: 'batch'},
                 'output4': {0: 'batch'},
                 'output5': {0: 'batch'},
                 'output6': {0: 'batch'}}
dynamic_hw = {'input':   {2: 'H', 3: 'W'},
              'output0': {2: 'H', 3: 'W'},
              'output1': {2: 'H', 3: 'W'},
              'output2': {2: 'H', 3: 'W'},
              'output3': {2: 'H', 3: 'W'},
              'output4': {2: 'H', 3: 'W'},
              'output5': {2: 'H', 3: 'W'},
              'output6': {2: 'H', 3: 'W'}}
dynamic_batch_hw = {'input':   {0: 'batch', 2: 'H', 3: 'W'},
                    'output0': {0: 'batch', 2: 'H', 3: 'W'},
                    'output1': {0: 'batch', 2: 'H', 3: 'W'},
                    'output2': {0: 'batch', 2: 'H', 3: 'W'},
                    'output3': {0: 'batch', 2: 'H', 3: 'W'},
                    'output4': {0: 'batch', 2: 'H', 3: 'W'},
                    'output5': {0: 'batch', 2: 'H', 3: 'W'},
                    'output6': {0: 'batch', 2: 'H', 3: 'W'}}
dynamic_ = [None, dynamic_batch, dynamic_hw, dynamic_batch_hw]
TRT_LOGGER = trt.Logger()
def pth2onnx(pth_model, onnx_name, input_shape=(1, 3, 512, 512), input_names=['input'], output_names=['output'], dynamix_axis=None):
    # pth_model: the PyTorch model with its weights already loaded
    # onnx_name: output path of the ONNX model
    # input_shape: the model input size (a recommended size)
    # input_names: list of model input names; a model may have several inputs
    # output_names: list of model output names; a model may have several outputs
    # dynamix_axis: dict, or None for a static input. Any dimension of each
    #   input/output can be declared dynamic, e.g.
    #   dynamic_batch_hw = {'input': {0: 'batch', 2: 'H', 3: 'W'}, 'output': {0: 'batch', 2: 'H', 3: 'W'}}
    #   marks B, H and W of 'input' and 'output' as dynamic sizes.
    print('[I] begin converting pth to onnx ...... ', dynamix_axis)
    input_tensor = torch.ones(input_shape)
    if next(pth_model.parameters()).is_cuda:
        input_tensor = input_tensor.to('cuda:0')
    with torch.no_grad():
        torch.onnx.export(pth_model,
                          input_tensor,
                          onnx_name,
                          opset_version=11,
                          input_names=input_names,
                          do_constant_folding=True,
                          output_names=output_names,
                          dynamic_axes=dynamix_axis)
    onnx_model = onnx.load(onnx_name)
    try:
        onnx.checker.check_model(onnx_model)
    except Exception as e:
        print('[Error] model incorrect:', e)
    else:
        print('[I] conversion to onnx finished:', onnx_name)
    print('')
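# Usage sketch for pth2onnx (hypothetical weight path; assumes U2NET is importable
# from model.u2net as in the commented import at the top of this file):
#   pth_model = U2NET(3, 1)
#   pth_model.load_state_dict(torch.load('weights/u2net_portrait.pth'))
#   pth2onnx(pth_model, 'weights/u2net_portrait.onnx',
#            input_shape=(1, 3, 512, 512),
#            input_names=['input'],
#            output_names=['output%d' % i for i in range(7)],
#            dynamix_axis=dynamic_hw)  # dynamic H/W, static batch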
def onnx_inference(onnx_input, model_name, outputName=['output0', 'output1', 'output2', 'output3', 'output4', 'output5', 'output6']):
    providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    print('[I] onnx_inference on model:', model_name)
    #outputName = ['pred_logits', 'pred_points']
    onnx_session = onnxruntime.InferenceSession(model_name, providers=providers)
    try:
        onnx_output = onnx_session.run(outputName, onnx_input)
    except Exception as e:
        onnx_output = None
        print(e)
    return onnx_output
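# Usage sketch for onnx_inference (hypothetical model path):
#   dummy = np.random.rand(1, 3, 512, 512).astype(np.float32)
#   outputs = onnx_inference({'input': dummy}, 'weights/u2net_portrait.onnx')
#   if outputs is not None:
#       print([o.shape for o in outputs])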
def onnx2engine(onnx_file_path, engine_file_path, input_shape=[1, 3, 512, 512], half=True, max_batch_size=1, input_profile_shapes=[None, None, None]):
    # onnx_file_path: input ONNX path
    # engine_file_path: output TensorRT engine path
    # input_shape: default model input size, e.g. [1,3,512,512]; for dynamic input use e.g. [1,3,-1,-1]
    # half: whether to use fp16, default True
    # max_batch_size: maximum batch size, default 1
    # input_profile_shapes: for dynamic input, the three profile shapes [min, opt, max];
    #   in that case input_shape must contain -1,
    #   e.g. (1,3,512,512), (1,3,1024,1024), (1,3,2048,2048)
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    config = builder.create_builder_config()
    parser = trt.OnnxParser(network, TRT_LOGGER)
    runtime = trt.Runtime(TRT_LOGGER)
    # Maximum workspace memory TensorRT may use while optimizing;
    # lower it if you run out of GPU memory.
    config.max_workspace_size = 1 << 30  # 1 GiB
    if builder.platform_has_fast_fp16 and half:
        config.set_flag(trt.BuilderFlag.FP16)
    builder.max_batch_size = max_batch_size  # at inference time keep batch_size <= max_batch_size
    # parse model file
    if not os.path.exists(onnx_file_path):
        print(f'onnx file {onnx_file_path} not found, please run torch_2_onnx.py first to generate it')
        exit(0)
    print(f'Loading ONNX file from path {onnx_file_path}...')
    with open(onnx_file_path, 'rb') as model:
        print('Beginning ONNX file parsing')
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
    # Static input setting
    network.get_input(0).shape = input_shape
    # Dynamic input setting: dynamic shapes are configured through an optimization
    # profile on the builder; bind one profile per dynamic input.
    if -1 in input_shape:
        profile = builder.create_optimization_profile()
        # min, opt and max shapes; inference-time inputs must fall inside this range
        profile.set_shape(network.get_input(0).name, input_profile_shapes[0], input_profile_shapes[1], input_profile_shapes[2])
        config.add_optimization_profile(profile)
    print('Completed parsing the ONNX file')
    print(f'Building an engine from file {onnx_file_path}; this may take a while...')
    t0 = time.time()
    engine = builder.build_engine(network, config)
    if engine is None:
        print('ERROR: engine build failed')
        return None
    with open(engine_file_path, 'wb') as f:
        f.write(engine.serialize())
    t1 = time.time()
    print('Completed creating Engine: %s, %.1fs' % (engine_file_path, t1 - t0))
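# Usage sketch for onnx2engine with a dynamic-H/W profile (hypothetical paths):
#   onnx2engine('weights/u2net_portrait.onnx', 'weights/u2net_portrait.engine',
#               input_shape=[1, 3, -1, -1], half=True, max_batch_size=1,
#               input_profile_shapes=[(1, 3, 512, 512), (1, 3, 1024, 1024), (1, 3, 2048, 2048)])
# The serialized engine can later be reloaded with the TensorRT runtime:
#   with open('weights/u2net_portrait.engine', 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
#       engine = runtime.deserialize_cuda_engine(f.read())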
try:
    # Sometimes python2 does not understand FileNotFoundError
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError
#EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
def GiB(val):
    return val * (1 << 30)
def add_help(description):
    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    args, _ = parser.parse_known_args()
def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[]):
    '''
    Parses sample arguments.
    Args:
        description (str): Description of the sample.
        subfolder (str): The subfolder containing data relevant to this sample.
        find_files (List[str]): A list of filenames to find. Each filename will be replaced with an absolute path.
    Returns:
        Tuple[List[str], List[str]]: The data directory paths and the absolute paths of the located files.
    '''
    # Standard command-line arguments for all samples.
    kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data")
    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory, and any additional data directories.", action="append", default=[kDEFAULT_DATA_ROOT])
    args, _ = parser.parse_known_args()

    def get_data_path(data_dir):
        # If the subfolder exists, append it to the path, otherwise use the provided path as-is.
        data_path = os.path.join(data_dir, subfolder)
        if not os.path.exists(data_path):
            print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.")
            data_path = data_dir
        # Make sure data directory exists.
        if not os.path.exists(data_path):
            print("WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(data_path))
        return data_path

    data_paths = [get_data_path(data_dir) for data_dir in args.datadir]
    return data_paths, locate_files(data_paths, find_files)
def locate_files(data_paths, filenames):
    """
    Locates the specified files in the specified data directories.
    If a file exists in multiple data directories, the first directory is used.
    Args:
        data_paths (List[str]): The data directories.
        filenames (List[str]): The names of the files to find.
    Returns:
        List[str]: The absolute paths of the files.
    Raises:
        FileNotFoundError if a file could not be located.
    """
    found_files = [None] * len(filenames)
    for data_path in data_paths:
        # Find all requested files.
        for index, (found, filename) in enumerate(zip(found_files, filenames)):
            if not found:
                file_path = os.path.abspath(os.path.join(data_path, filename))
                if os.path.exists(file_path):
                    found_files[index] = file_path
    # Check that all files were found
    for f, filename in zip(found_files, filenames):
        if not f or not os.path.exists(f):
            raise FileNotFoundError("Could not find {:}. Searched in data paths: {:}".format(filename, data_paths))
    return found_files
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine, input_shape, streamFlag=True):
    inputs = []
    outputs = []
    bindings = []
    if streamFlag:
        stream = cuda.Stream()
    else:
        stream = None
    for ib, binding in enumerate(engine):
        dims = engine.get_binding_shape(binding)
        #print(engine.get_binding_name(ib), dims, engine.max_batch_size)
        if -1 in dims:
            # Dynamic binding: fall back to the caller-supplied shape(s).
            if isinstance(input_shape, list):
                dims = input_shape[ib]
            else:
                dims = input_shape
        #size = trt.volume(dims) * engine.max_batch_size
        size = trt.volume(dims)
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
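# Sketch of the intended call sequence (hypothetical engine path; the shapes must
# match the optimization profile the engine was built with):
#   with open('weights/u2net_portrait.engine', 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
#       engine = runtime.deserialize_cuda_engine(f.read())
#   context = engine.create_execution_context()
#   # One shape per binding (1 input + 7 outputs for the u2net example above):
#   shapes = [(1, 3, 512, 512)] + [(1, 1, 512, 512)] * 7
#   inputs, outputs, bindings, stream = allocate_buffers(engine, shapes)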
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
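# Note: do_inference targets the older implicit-batch API (context.execute_async).
# Engines built with the EXPLICIT_BATCH flag, as in onnx2engine above, should go
# through do_inference_v2 below instead.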
# This function is generalized for multiple inputs/outputs for full dimension networks.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    #context.execute_v2(bindings)  # synchronous variant
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
def trt_inference(img, img_h, img_w, context, inputs, outputs, bindings, stream, input_name='input'):
    # Inputs:
    #   img -- numpy array in NCHW layout
    #   img_h, img_w -- H and W of the image fed to the model; required for dynamic input
    #   context -- the TensorRT execution context created by the caller
    #   inputs, outputs, bindings, stream -- buffers allocated when the first image was
    #     processed, with their device addresses bound to the TensorRT bindings
    #   input_name -- name of the model's input tensor
    # Output:
    #   trt_outputs -- a list whose elements are numpy arrays
    origin_inputshape = context.get_tensor_shape(input_name)
    #if origin_inputshape[-1] == -1:
    context.set_optimization_profile_async(0, stream.handle)
    origin_inputshape[-2], origin_inputshape[-1] = (img_h, img_w)
    context.set_input_shape(input_name, origin_inputshape)
    inputs[0].host = np.ascontiguousarray(img)
    trt_outputs = do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    return trt_outputs
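# End-to-end sketch with a dynamic-H/W engine (hypothetical image; context and
# buffers come from the allocate_buffers sketch above and are reused across calls):
#   img = np.random.rand(1, 3, 512, 512).astype(np.float32)
#   trt_outputs = trt_inference(img, 512, 512, context, inputs, outputs, bindings, stream)
#   pred = trt_outputs[0].reshape(1, 1, 512, 512)  # assumes output0 is an (N,1,H,W) mask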
# def do_inference_v3(context, bindings, inputs, outputs, stream, h_, w_):
#     '''
#     Copy from https://github.com/zhaogangthu/keras-yolo3-ocr-tensorrt/blob/master/tensorRT_yolo3/common.py
#     '''
#     context.set_binding_shape(0, (1, 3, h_, w_))
#     # Transfer input data to the GPU.
#     [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
#     # Run inference.
#     context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
#     # Transfer predictions back from the GPU.
#     [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
#     # Synchronize the stream
#     stream.synchronize()
#     # Return only the host outputs.
#     return [out.host for out in outputs]
if __name__ == '__main__':
    from model.u2net import U2NET  # needed for the pth2onnx demo below
    model_path = 'weights/u2net_portrait.pth'
    onnx_name = model_path.replace('.pth', '.onnx')
    trt_name = model_path.replace('.pth', '.engine')
    pth_model = U2NET(3, 1)
    pth_model.load_state_dict(torch.load(model_path))
    input_names = ['input']
    output_names = ['output%d' % i for i in range(7)]
    dynamix_axis = dynamic_hw
    input_shape = (1, 3, 512, 512)
    # Test converting the pth model to onnx
    #pth2onnx(pth_model, onnx_name, input_shape=input_shape, input_names=input_names, output_names=output_names, dynamix_axis=dynamix_axis)
    # Test converting the onnx model to a trt engine
    input_profile_shapes = [(1, 3, 512, 512), (1, 3, 1024, 1024), (1, 3, 2048, 2048)]
    input_shape = [1, 3, -1, -1]
    half = True
    max_batch_size = 1
    onnx2engine(onnx_name, trt_name, input_shape=input_shape, half=half, max_batch_size=max_batch_size, input_profile_shapes=input_profile_shapes)
'''
with torch.no_grad():
    for i, model_name in enumerate(model_names):
        print(f'process model:{model_name}...')
        torch.onnx.export(model,
                          input_tensor,
                          model_name,
                          opset_version=11,
                          input_names=['input'],
                          output_names=['output0','output1','output2','output3','output4','output5','output6'],
                          dynamic_axes=dynamic_[i])
        print(f'onnx model:{model_name} saved successfully...')
        #print('sleep 10s...')
        time.sleep(10)
        print(f'begin check onnx model:{model_name}...')
        onnx_model = onnx.load(model_name)
        try:
            onnx.checker.check_model(onnx_model)
        except Exception as e:
            print('model incorrect')
            print(e)
        else:
            print('model correct')

print('*'*50)
print('Begin to test...')
case_1 = np.random.rand(1, 3, 512, 512).astype(np.float32)
case_2 = np.random.rand(2, 3, 512, 512).astype(np.float32)
case_3 = np.random.rand(1, 3, 224, 224).astype(np.float32)
cases = [case_1, case_2, case_3]
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
for model_name in model_names:
    print('-'*50, model_name)
    onnx_session = onnxruntime.InferenceSession(model_name, providers=providers)
    for i, case in enumerate(cases):
        onnx_input = {'input': case}
        try:
            onnx_output = onnx_session.run(['output0','output1','output2','output3','output4','output5','output6'], onnx_input)[0]
        except Exception as e:
            print(f'Input:{i} on model:{model_name} failed')
            print(e)
        else:
            print(f'Input:{i} on model:{model_name} succeed')
'''