465 lines
20 KiB
Python
465 lines
20 KiB
Python
#
|
||
# Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
|
||
#
|
||
# NOTICE TO LICENSEE:
|
||
#
|
||
# This source code and/or documentation ("Licensed Deliverables") are
|
||
# subject to NVIDIA intellectual property rights under U.S. and
|
||
# international Copyright laws.
|
||
#
|
||
# These Licensed Deliverables contained herein is PROPRIETARY and
|
||
# CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||
# conditions of a form of NVIDIA software license agreement by and
|
||
# between NVIDIA and Licensee ("License Agreement") or electronically
|
||
# accepted by Licensee. Notwithstanding any terms or conditions to
|
||
# the contrary in the License Agreement, reproduction or disclosure
|
||
# of the Licensed Deliverables to any third party without the express
|
||
# written consent of NVIDIA is prohibited.
|
||
#
|
||
# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||
# LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||
# SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||
# PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||
# NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||
# DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||
# NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||
# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||
# LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||
# SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||
# OF THESE LICENSED DELIVERABLES.
|
||
#
|
||
# U.S. Government End Users. These Licensed Deliverables are a
|
||
# "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||
# 1995), consisting of "commercial computer software" and "commercial
|
||
# computer software documentation" as such terms are used in 48
|
||
# C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||
# only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||
# 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||
# U.S. Government End Users acquire the Licensed Deliverables with
|
||
# only those rights set forth herein.
|
||
#
|
||
# Any use of the Licensed Deliverables in individual and commercial
|
||
# software must include, in the user documentation and internal
|
||
# comments to the code, the above Disclaimer and U.S. Government End
|
||
# Users Notice.
|
||
#
|
||
|
||
|
||
import argparse
|
||
import pycuda.driver as cuda
|
||
import pycuda.autoinit
|
||
import numpy as np
|
||
import torch
|
||
import tensorrt as trt
|
||
|
||
|
||
import time
|
||
import onnx
|
||
import onnxruntime
|
||
import os,sys,cv2
|
||
#from model.u2net import U2NET
|
||
|
||
#cuda.init()
|
||
# File names of the ONNX exports; intended to be index-aligned with ``dynamic_``
# (static, dynamic batch, dynamic H/W, dynamic batch + H/W).
model_names = [
    'u2net.onnx',
    'u2net_dynamic_batch.onnx',
    'u2net_dynamic_hw.onnx',
    'u2net_dynamic_batch-hw.onnx',
]

# Tensor names that appear in every dynamic-axes table: one input, seven outputs.
_TENSOR_NAMES = ['input'] + ['output%d' % i for i in range(7)]


def _dynamic_axes(axes):
    """Return a ``dynamic_axes`` table that gives every tensor its own copy of ``axes``."""
    return {name: dict(axes) for name in _TENSOR_NAMES}


# Only the batch dimension is dynamic.
dynamic_batch = _dynamic_axes({0: 'batch'})
# Only height and width are dynamic.
dynamic_hw = _dynamic_axes({2: 'H', 3: 'W'})
# Batch, height and width are all dynamic.
dynamic_batch_hw = _dynamic_axes({0: 'batch', 2: 'H', 3: 'W'})

# ``None`` stands for a fully static export.
dynamic_ = [None, dynamic_batch, dynamic_hw, dynamic_batch_hw]
|
||
|
||
# Module-wide TensorRT logger, created with default arguments and handed to the
# TensorRT objects constructed in onnx2engine.
TRT_LOGGER = trt.Logger()
|
||
def pth2onnx(pth_model,onnx_name,input_shape=(1,3,512,512),input_names=['input'],output_names=['output'],dynamix_axis=None):
    """Export a PyTorch model to ONNX and run the ONNX checker on the written file.

    Args:
        pth_model: PyTorch model with its weights already loaded.
        onnx_name: Path of the ONNX file to write.
        input_shape: Shape of the all-ones dummy tensor used for the export
            (the suggested input size).
        input_names: List of names for the model inputs; several inputs are allowed.
            The default list is shared between calls and is never mutated here.
        output_names: List of names for the model outputs; several outputs are allowed.
            The default list is shared between calls and is never mutated here.
        dynamix_axis: ``dynamic_axes`` dict forwarded to ``torch.onnx.export``;
            ``None`` means a static-shape export. Every input and output may
            declare its own dynamic dimensions, e.g.
            ``{'input': {0: 'batch', 2: 'H', 3: 'W'}, 'output': {0: 'batch', 2: 'H', 3: 'W'}}``
            makes B, H and W dynamic on both tensors.

    Returns:
        None. A checker failure is printed, not raised.
    """
    print('[I] beg to converting pth to onnx ...... ',dynamix_axis)
    input_tensor = torch.ones(input_shape)

    # Put the dummy input on the same device as the model. Using the parameter's
    # own device (rather than a fixed 'cuda:0') keeps models on other GPUs
    # working, and the None default tolerates models without parameters.
    first_param = next(pth_model.parameters(), None)
    if first_param is not None and first_param.is_cuda:
        input_tensor = input_tensor.to(first_param.device)

    with torch.no_grad():
        torch.onnx.export(
            pth_model,
            input_tensor,
            onnx_name,
            opset_version=11,
            input_names=input_names,
            do_constant_folding=True,
            output_names=output_names,
            dynamic_axes=dynamix_axis,
        )

    # Re-load the file that was just written and validate it.
    onnx_model = onnx.load(onnx_name)
    try:
        onnx.checker.check_model(onnx_model)
    except Exception as e:
        # Best effort: report the problem but leave the exported file in place.
        print('[Error] model incorrect:',e)
    else:
        print('[I] conver to onnx over in ', onnx_name)
    print('')
|
||
def onnx_inference(onnx_input,model_name,outputName=['output0','output1','output2','output3','output4','output5','output6' ]):
    """Run one ONNX Runtime inference and return the requested outputs.

    Args:
        onnx_input: Input feed passed straight to ``InferenceSession.run``.
        model_name: Path of the ONNX model to load; a new session is created
            on every call.
        outputName: Names of the outputs to fetch.

    Returns:
        Whatever ``InferenceSession.run`` returns, or ``None`` if it raised
        (the exception is printed, not propagated).
    """
    # Provider preference order: CUDA first, then CPU.
    execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    print('8'*10, ' line125:',model_name)
    session = onnxruntime.InferenceSession(model_name, providers=execution_providers)
    try:
        return session.run(outputName, onnx_input)
    except Exception as e:
        print(e)
        return None
|
||
def onnx2engine(onnx_file_path,engine_file_path,input_shape=[1,3,512,512],half=True,max_batch_size=1,input_profile_shapes=[None,None,None]):
    """Build a TensorRT engine from an ONNX file and write it to disk.

    Args:
        onnx_file_path: Path of the ONNX model to read.
        engine_file_path: Path of the serialized TensorRT engine to write.
        input_shape: Shape assigned to the first network input, e.g.
            ``[1, 3, 512, 512]``; use ``-1`` for dynamic dimensions, e.g.
            ``[1, 3, -1, -1]``.
        half: Enable FP16 when the platform reports fast FP16 support.
        max_batch_size: Value assigned to ``builder.max_batch_size``; the batch
            size used at inference time must not exceed it.
        input_profile_shapes: ``[min, opt, max]`` shapes for the optimization
            profile, required when ``input_shape`` contains ``-1``, e.g.
            ``[(1, 3, 512, 512), (1, 3, 1024, 1024), (1, 3, 2048, 2048)]``.
            Inference inputs must fall inside this range.

    Returns:
        None in every case; parse and build failures are printed.

    Raises:
        SystemExit: With status 1 if ``onnx_file_path`` does not exist.
        ValueError: If the input is dynamic but the profile shapes are missing.
    """
    # Missing input is a failure, so exit with a non-zero status.
    if not os.path.exists(onnx_file_path):
        print(f'onnx file {onnx_file_path} not found,please run torch_2_onnx.py first to generate it')
        sys.exit(1)

    is_dynamic = -1 in input_shape
    if is_dynamic and (len(input_profile_shapes) < 3
                       or any(shape is None for shape in input_profile_shapes[:3])):
        raise ValueError('input_profile_shapes must provide [min, opt, max] shapes '
                         'when input_shape contains -1')

    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    config = builder.create_builder_config()
    parser = trt.OnnxParser(network,TRT_LOGGER)

    # Workspace limit for the TensorRT optimizer (1 GiB). Adjust it if the
    # build runs out of GPU memory.
    config.max_workspace_size = 1<<30
    if builder.platform_has_fast_fp16 and half:
        config.set_flag(trt.BuilderFlag.FP16)
    # Inference must use batch_size <= max_batch_size.
    builder.max_batch_size = max_batch_size

    # Parse the model file.
    print(f'Loading ONNX file from path {onnx_file_path}...')
    with open(onnx_file_path,'rb') as model:
        print('Beginning ONNX file parsing')
        if not parser.parse(model.read()):
            print('ERROR:Failed to parse the ONNX file')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None

    # Static input setting.
    network.get_input(0).shape=input_shape
    # Dynamic input setting: the allowed range is declared through an
    # optimization profile bound to the input tensor.
    if is_dynamic:
        profile = builder.create_optimization_profile()
        # Smallest, typical and largest shape, in that order.
        profile.set_shape(network.get_input(0).name,
                          input_profile_shapes[0],
                          input_profile_shapes[1],
                          input_profile_shapes[2])
        config.add_optimization_profile(profile)

    print('Completed parsing the ONNX file')
    print(f'Building an engine from file {onnx_file_path}; this may take a while...')

    t0 = time.time()
    engine = builder.build_engine(network,config)
    # build_engine signals failure by returning None; do not write a bogus file.
    if engine is None:
        print('ERROR:Failed to build the engine')
        return None

    with open(engine_file_path,'wb') as f:
        f.write(engine.serialize())
    t1 = time.time()
    print('Completed creating Engine:%s, %.1f'%(engine_file_path,t1-t0))
|
||
|
||
|
||
|
||
|
||
|
||
try:
    # Python 2 has no FileNotFoundError builtin; evaluating the bare name
    # probes for it without any other effect.
    FileNotFoundError
except NameError:
    # Fall back to IOError so code that raises FileNotFoundError still works.
    FileNotFoundError = IOError
|
||
|
||
#EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
|
||
|
||
def GiB(val):
    """Convert a size in GiB to a whole number of bytes.

    The original expression ``val * 1 << 30`` parses as ``(val * 1) << 30``,
    which only works for integers. Multiplying by ``1 << 30`` gives the same
    result for integers and also accepts fractional sizes such as ``0.5``.

    Args:
        val: Size in GiB (int or float).

    Returns:
        int: The size in bytes, truncated towards zero.
    """
    return int(val * (1 << 30))
|
||
|
||
|
||
def add_help(description):
    """Parse the command line with a parser that defines no options of its own.

    The only options the parser knows are argparse's built-in ``-h/--help``,
    which print ``description`` and exit. Everything else on the command line
    is ignored because ``parse_known_args`` is used.

    Args:
        description (str): Text shown in the help output.

    Returns:
        None. The parsed result is not used.
    """
    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.parse_known_args()
|
||
|
||
|
||
def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=()):
    '''
    Parses sample arguments and resolves the sample data locations.

    Args:
        description (str): Description of the sample, shown in the help output.
        subfolder (str): The subfolder containing data relevant to this sample.
        find_files (Sequence[str]): Filenames to find. Each filename is
            resolved to an absolute path by ``locate_files``.

    Returns:
        Tuple[List[str], List[str]]: The data directories (one per ``-d``
        value, including the default root) and the absolute paths of
        ``find_files``.
    '''

    # Standard command-line arguments for all samples.
    kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data")
    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # With action="append" the user-supplied directories are added after the
    # default root rather than replacing it.
    parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory, and any additional data directories.", action="append", default=[kDEFAULT_DATA_ROOT])
    args, _ = parser.parse_known_args()

    def get_data_path(data_dir):
        # If the subfolder exists, append it to the path, otherwise use the provided path as-is.
        data_path = os.path.join(data_dir, subfolder)
        if not os.path.exists(data_path):
            print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.")
            data_path = data_dir
        # Make sure data directory exists; only warn, the caller decides what to do.
        if not (os.path.exists(data_path)):
            print("WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(data_path))
        return data_path

    data_paths = [get_data_path(data_dir) for data_dir in args.datadir]
    return data_paths, locate_files(data_paths, find_files)
|
||
|
||
def locate_files(data_paths, filenames):
    """
    Locates the specified files in the specified data directories.
    If a file exists in multiple data directories, the first directory is used.

    Args:
        data_paths (Iterable[str]): The data directories, in search order.
        filenames (Iterable[str]): The names of the files to find.

    Returns:
        List[str]: The absolute paths of the files, in the order of ``filenames``.

    Raises:
        FileNotFoundError if a file could not be located.
    """
    # Materialize once so any iterable (not only sized sequences) is accepted
    # and the directories can be scanned again for every filename.
    search_dirs = list(data_paths)

    located = []
    for filename in filenames:
        candidates = (os.path.abspath(os.path.join(directory, filename)) for directory in search_dirs)
        # The first existing candidate wins, which preserves directory priority.
        match = next((path for path in candidates if os.path.exists(path)), None)
        if match is None:
            raise FileNotFoundError("Could not find {:}. Searched in data paths: {:}".format(filename, data_paths))
        located.append(match)
    return located
|
||
|
||
# Simple helper data class that's a little nicer to use than a 2-tuple.
|
||
class HostDeviceMem:
    """Holds a host buffer together with its matching device buffer.

    Both attributes are plain and writable; ``host`` and ``device`` store
    whatever objects are passed in.
    """

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Four lines: a label followed by the value, for host then device.
        return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

    def __repr__(self):
        # The repr is deliberately the same text as the str form.
        return str(self)
|
||
|
||
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
|
||
def allocate_buffers(engine,input_shape,streamFlag=True):
    """Allocate host and device buffers for every binding of an engine.

    Args:
        engine: TensorRT engine; iterated to obtain its bindings.
        input_shape: Shape to use for bindings whose engine shape contains
            ``-1``. Either a single shape (tuple or flat list of ints), applied
            to every dynamic binding including outputs, or a list with one
            shape per binding, indexed by binding position.
        streamFlag: Create a CUDA stream when true; otherwise ``stream`` is None.

    Returns:
        tuple: ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem`` and ``bindings`` holds the
        device buffers converted with ``int()``, in binding order.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream() if streamFlag else None

    # A flat list of integers is one shape, not a per-binding table. Without
    # this check [1, 3, 512, 512] would be indexed per binding and an int
    # would reach trt.volume.
    is_single_shape_list = (
        isinstance(input_shape, list)
        and len(input_shape) > 0
        and all(isinstance(dim, (int, np.integer)) for dim in input_shape)
    )
    per_binding_shapes = isinstance(input_shape, list) and not is_single_shape_list

    for ib,binding in enumerate(engine):
        dims = engine.get_binding_shape(binding)
        # Dynamic dimensions have no fixed size, so fall back to the caller's shape.
        if -1 in dims:
            dims = input_shape[ib] if per_binding_shapes else input_shape

        size = trt.volume(dims)
        dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))

        # Append to the appropriate list.
        buffer_pair = HostDeviceMem(host_mem, device_mem)
        if engine.binding_is_input(binding):
            inputs.append(buffer_pair)
        else:
            outputs.append(buffer_pair)
    return inputs, outputs, bindings, stream
|
||
# This function is generalized for multiple inputs/outputs.
|
||
# inputs and outputs are expected to be lists of HostDeviceMem objects.
|
||
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Copy inputs to the device, run ``execute_async`` and copy outputs back.

    Args:
        context: Execution context providing ``execute_async``.
        bindings: Binding list passed to ``execute_async``.
        inputs: List of ``HostDeviceMem`` objects holding the input data.
        outputs: List of ``HostDeviceMem`` objects receiving the results.
        stream: CUDA stream on which all copies and the execution are queued.
        batch_size: Batch size passed to ``execute_async``.

    Returns:
        list: The ``host`` buffer of every entry in ``outputs``.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait for the queued work before the host buffers are read.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
|
||
|
||
# This function is generalized for multiple inputs/outputs for full dimension networks.
|
||
# inputs and outputs are expected to be lists of HostDeviceMem objects.
|
||
def do_inference_v2(context, bindings, inputs, outputs, stream):
    """Copy inputs to the device, run ``execute_async_v2`` and copy outputs back.

    Variant for full-dimension networks: no batch size is passed to the
    execution call.

    Args:
        context: Execution context providing ``execute_async_v2``.
        bindings: Binding list passed to ``execute_async_v2``.
        inputs: List of ``HostDeviceMem`` objects holding the input data.
        outputs: List of ``HostDeviceMem`` objects receiving the results.
        stream: CUDA stream on which all copies and the execution are queued.

    Returns:
        list: The ``host`` buffer of every entry in ``outputs``.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait for the queued work before the host buffers are read.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
|
||
|
||
def trt_inference( img,img_h,img_w,context,inputs,outputs,bindings,stream,input_name = 'input'):
    """Run one inference on a dynamic-shape engine.

    Args:
        img: Input image as a NumPy array in NCHW layout.
        img_h, img_w: Height and width of the image fed to the model; needed
            to set the dynamic input shape.
        context: TensorRT execution context created by the caller.
        inputs, outputs, bindings, stream: Buffers, binding addresses and CUDA
            stream allocated once by the caller (before the first image).
        input_name: Name of the model's input tensor.

    Returns:
        list: The output host buffers, each a NumPy array.
    """
    origin_inputshape = context.get_tensor_shape( input_name)

    # Select profile 0, then replace the last two dimensions (H, W) with the
    # actual image size.
    context.set_optimization_profile_async(0,stream.handle)
    origin_inputshape[-2],origin_inputshape[-1]=(img_h,img_w)
    context.set_input_shape(input_name, (origin_inputshape))

    # Upload the image in the dtype of the existing host buffer so the number
    # of bytes copied matches what the device buffer was allocated for. If the
    # buffer exposes no dtype, the image's own dtype is kept.
    target_dtype = getattr(inputs[0].host, 'dtype', None)
    inputs[0].host = np.ascontiguousarray(img, dtype=target_dtype)
    trt_outputs = do_inference_v2(context,bindings=bindings,inputs=inputs,outputs=outputs,stream=stream)
    return trt_outputs
|
||
|
||
# def do_inference_v3(context, bindings, inputs, outputs, stream,h_,w_):
|
||
# '''
|
||
# Copy from https://github.com/zhaogangthu/keras-yolo3-ocr-tensorrt/blob/master/tensorRT_yolo3/common.py
|
||
#
|
||
# '''
|
||
# # Transfer input data to the GPU.
|
||
#
|
||
# context.set_binding_shape(0, (1, 3, h_, w_))
|
||
#
|
||
# [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
|
||
# # Run inference.
|
||
# context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
|
||
# # Transfer predictions back from the GPU.
|
||
# [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
|
||
# # Synchronize the stream
|
||
# stream.synchronize()
|
||
# # Return only the host outputs.
|
||
# return [out.host for out in outputs]
|
||
if __name__=='__main__':
    # The module-level U2NET import is commented out, so import it here;
    # without it the constructor call below raises NameError.
    from model.u2net import U2NET

    model_path='weights/u2net_portrait.pth'
    onnx_name = model_path.replace('.pth','.onnx')
    trt_name = model_path.replace('.pth','.engine')
    pth_model = U2NET(3,1)
    pth_model.load_state_dict(torch.load(model_path))

    input_names=['input']
    output_names=['output%d'%(i) for i in range(7)]
    dynamix_axis = dynamic_hw
    input_shape =(1,3,512,512)
    # Step 1 (currently disabled): convert the .pth model to ONNX.
    #pth2onnx(pth_model,onnx_name,input_shape=input_shape ,input_names=input_names ,output_names=output_names ,dynamix_axis=dynamix_axis )

    # Step 2: convert the ONNX model to a TensorRT engine with dynamic H/W.
    # The ONNX file must already exist because step 1 is disabled.
    input_profile_shapes = [(1,3,512,512),(1,3,1024,1024),(1,3,2048,2048)]
    input_shape = [1,3,-1,-1]
    half=True
    max_batch_size = 1
    onnx2engine(onnx_name,trt_name,input_shape=input_shape,half=half,max_batch_size=max_batch_size,input_profile_shapes=input_profile_shapes)
|
||
'''
|
||
with torch.no_grad():
|
||
for i,model_name in enumerate(model_names):
|
||
print(f'process model:{model_name}...')
|
||
torch.onnx.export(model,
|
||
input_tensor,
|
||
model_name,
|
||
opset_version=11,
|
||
input_names=['input'],
|
||
output_names=['output0','output1','output2','output3','output4','output5','output6'],
|
||
dynamic_axes=dynamic_[i])
|
||
|
||
print(f'onnx model:{model_name} saved successfully...')
|
||
|
||
#print('sleep 10s...')
|
||
time.sleep(10)
|
||
print(f'begin check onnx model:{model_name}...')
|
||
|
||
onnx_model = onnx.load(model_name)
|
||
try:
|
||
onnx.checker.check_model(onnx_model)
|
||
except Exception as e:
|
||
print('model incorrect')
|
||
print(e)
|
||
else:
|
||
print('model correct')
|
||
|
||
print('*'*50)
|
||
print('Begin to test...')
|
||
case_1 = np.random.rand(1,3,512,512).astype(np.float32)
|
||
case_2 = np.random.rand(2,3,512,512).astype(np.float32)
|
||
case_3 = np.random.rand(1,3,224,224).astype(np.float32)
|
||
cases = [case_1,case_2,case_3]
|
||
|
||
providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||
for model_name in model_names:
|
||
print('-'*50,model_name)
|
||
onnx_session = onnxruntime.InferenceSession(model_name,providers=providers)
|
||
for i,case in enumerate(cases):
|
||
onnx_input = {'input':case}
|
||
try:
|
||
onnx_output = onnx_session.run(['output0','output1','output2','output3','output4','output5','output6'],onnx_input)[0]
|
||
except Exception as e:
|
||
print(f'Input:{i} on model:{model_name} failed')
|
||
print(e)
|
||
else:
|
||
print(f'Input:{i} on model:{model_name} succeed')
|
||
'''
|