AIlib2/segutils/trtUtils2.py

465 lines
20 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#
# Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
#
# NOTICE TO LICENSEE:
#
# This source code and/or documentation ("Licensed Deliverables") are
# subject to NVIDIA intellectual property rights under U.S. and
# international Copyright laws.
#
# These Licensed Deliverables contained herein is PROPRIETARY and
# CONFIDENTIAL to NVIDIA and is being provided under the terms and
# conditions of a form of NVIDIA software license agreement by and
# between NVIDIA and Licensee ("License Agreement") or electronically
# accepted by Licensee. Notwithstanding any terms or conditions to
# the contrary in the License Agreement, reproduction or disclosure
# of the Licensed Deliverables to any third party without the express
# written consent of NVIDIA is prohibited.
#
# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
# LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
# SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
# PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
# NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
# DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
# NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
# LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
# SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THESE LICENSED DELIVERABLES.
#
# U.S. Government End Users. These Licensed Deliverables are a
# "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
# 1995), consisting of "commercial computer software" and "commercial
# computer software documentation" as such terms are used in 48
# C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
# only as a commercial end item. Consistent with 48 C.F.R.12.212 and
# 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
# U.S. Government End Users acquire the Licensed Deliverables with
# only those rights set forth herein.
#
# Any use of the Licensed Deliverables in individual and commercial
# software must include, in the user documentation and internal
# comments to the code, the above Disclaimer and U.S. Government End
# Users Notice.
#
import argparse
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import torch
import tensorrt as trt
import time
import onnx
import onnxruntime
import os,sys,cv2
#from model.u2net import U2NET
#cuda.init()
# File names of the ONNX variants; index-aligned with ``dynamic_`` below
# (static, dynamic batch, dynamic height/width, dynamic batch + height/width).
model_names = [
    'u2net.onnx',
    'u2net_dynamic_batch.onnx',
    'u2net_dynamic_hw.onnx',
    'u2net_dynamic_batch-hw.onnx',
]

# Tensor names that appear in every dynamic-axes table: the single input
# followed by the seven outputs ``output0`` .. ``output6``.
_TENSOR_NAMES = ['input'] + ['output%d' % i for i in range(7)]

# Axis 0 (batch) is dynamic for every tensor.
dynamic_batch = {name: {0: 'batch'} for name in _TENSOR_NAMES}

# Axes 2 and 3 (height, width) are dynamic for every tensor.
dynamic_hw = {name: {2: 'H', 3: 'W'} for name in _TENSOR_NAMES}

# Batch, height and width are all dynamic for every tensor.
dynamic_batch_hw = {name: {0: 'batch', 2: 'H', 3: 'W'} for name in _TENSOR_NAMES}

# One dynamic-axes table per export variant; None means a fully static graph.
dynamic_ = [None, dynamic_batch, dynamic_hw, dynamic_batch_hw]
# Module-wide TensorRT logger; it is handed to the TensorRT objects
# constructed in onnx2engine.
TRT_LOGGER = trt.Logger()
def pth2onnx(pth_model, onnx_name, input_shape=(1, 3, 512, 512), input_names=['input'], output_names=['output'], dynamix_axis=None):
    """Export a PyTorch model to an ONNX file and run the ONNX checker on it.

    Args:
        pth_model: PyTorch module with its weights already loaded.
        onnx_name: Path of the ONNX file to write.
        input_shape: Shape of the all-ones dummy tensor used for the export
            (the suggested input size).
        input_names: Names given to the graph inputs; a list, so a model with
            several inputs can be described. The default list is never
            mutated here.
        output_names: Names given to the graph outputs; a list, so a model
            with several outputs can be described. The default list is never
            mutated here.
        dynamix_axis: ``None`` for a static graph, otherwise a dict forwarded
            to ``torch.onnx.export`` as ``dynamic_axes``. For example
            ``{'input': {0: 'batch', 2: 'H', 3: 'W'},
            'output': {0: 'batch', 2: 'H', 3: 'W'}}`` marks B, H and W of
            both tensors as dynamic.

    Returns:
        None. The result of the checker is reported on stdout.
    """
    print('[I] beg to converting pth to onnx ...... ', dynamix_axis)
    input_tensor = torch.ones(input_shape)

    # Put the dummy input on the same device as the model. The device is read
    # from the first parameter instead of assuming 'cuda:0', so a model that
    # lives on another GPU is exported without a device mismatch. A module
    # without parameters keeps the CPU tensor.
    first_param = next(pth_model.parameters(), None)
    if first_param is not None and first_param.is_cuda:
        input_tensor = input_tensor.to(first_param.device)

    with torch.no_grad():
        torch.onnx.export(pth_model,
                          input_tensor,
                          onnx_name,
                          opset_version=11,
                          input_names=input_names,
                          do_constant_folding=True,
                          output_names=output_names,
                          dynamic_axes=dynamix_axis)

    # Reload the file that was just written and validate it.
    onnx_model = onnx.load(onnx_name)
    try:
        onnx.checker.check_model(onnx_model)
    except Exception as e:
        print('[Error] model incorrect:', e)
    else:
        print('[I] conver to onnx over in ', onnx_name)
        print('')
def onnx_inference(onnx_input, model_name, outputName=['output0', 'output1', 'output2', 'output3', 'output4', 'output5', 'output6']):
    """Run one forward pass of an ONNX model with onnxruntime.

    A new ``InferenceSession`` is created on every call, with the CUDA
    execution provider listed before the CPU one.

    Args:
        onnx_input: Input feed passed as the second argument of
            ``InferenceSession.run``.
        model_name: Path of the ONNX model to load.
        outputName: Names of the outputs to fetch.

    Returns:
        Whatever ``InferenceSession.run`` returns, or ``None`` when the run
        raised; in that case the exception is printed.
    """
    execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    print('8'*10, ' line125:', model_name)
    session = onnxruntime.InferenceSession(model_name, providers=execution_providers)
    try:
        return session.run(outputName, onnx_input)
    except Exception as err:
        # Best effort: report the failure and signal it with None.
        print(err)
        return None
def onnx2engine(onnx_file_path, engine_file_path, input_shape=[1, 3, 512, 512], half=True, max_batch_size=1, input_profile_shapes=[None, None, None]):
    """Build a TensorRT engine from an ONNX file and write it to disk.

    Args:
        onnx_file_path: Path of the ONNX model to read.
        engine_file_path: Path the serialized TensorRT engine is written to.
        input_shape: Shape assigned to the first network input, e.g.
            ``[1, 3, 512, 512]``. Use -1 for dynamic dimensions, e.g.
            ``[1, 3, -1, -1]``.
        half: Request FP16 when the builder reports fast FP16 support.
        max_batch_size: Maximum batch size; inference must use
            ``batch_size <= max_batch_size``.
        input_profile_shapes: ``[min, opt, max]`` shapes of the optimization
            profile, needed when ``input_shape`` contains -1, e.g.
            ``[(1, 3, 512, 512), (1, 3, 1024, 1024), (1, 3, 2048, 2048)]``.
            Shapes used at inference time must lie inside this range.

    Returns:
        None. ONNX parse errors and engine build failures are printed and no
        engine file is written in those cases.

    Raises:
        ValueError: ``input_shape`` is dynamic but ``input_profile_shapes``
            does not hold three shapes.
        SystemExit: The ONNX file does not exist (exit status 1).
    """
    # Fail early with a clear message instead of an error from inside
    # TensorRT when a dynamic shape is requested without a full profile.
    is_dynamic = -1 in input_shape
    if is_dynamic and (len(input_profile_shapes) != 3
                       or any(shape is None for shape in input_profile_shapes)):
        raise ValueError('input_shape %s is dynamic, so input_profile_shapes must be '
                         '[min_shape, opt_shape, max_shape]; got %s'
                         % (list(input_shape), list(input_profile_shapes)))

    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    config = builder.create_builder_config()
    parser = trt.OnnxParser(network, TRT_LOGGER)

    # Workspace limit for the builder: 1 << 30 bytes = 1 GiB. Adjust this
    # value if the build runs out of GPU memory.
    config.max_workspace_size = 1 << 30
    if builder.platform_has_fast_fp16 and half:
        config.set_flag(trt.BuilderFlag.FP16)
    builder.max_batch_size = max_batch_size

    # Parse the model file.
    if not os.path.exists(onnx_file_path):
        print(f'onnx file {onnx_file_path} not found,please run torch_2_onnx.py first to generate it')
        # Non-zero status: a missing input file is a failure, not a success.
        sys.exit(1)
    print(f'Loading ONNX file from path {onnx_file_path}...')
    with open(onnx_file_path, 'rb') as model:
        print('Beginning ONNX file parsing')
        if not parser.parse(model.read()):
            print('ERROR:Failed to parse the ONNX file')
            for error_index in range(parser.num_errors):
                print(parser.get_error(error_index))
            return None

    # Static input setting.
    network.get_input(0).shape = input_shape
    # Dynamic input setting: the allowed range is described by an
    # optimization profile attached to the builder config.
    if is_dynamic:
        profile = builder.create_optimization_profile()
        min_shape, opt_shape, max_shape = input_profile_shapes
        profile.set_shape(network.get_input(0).name, min_shape, opt_shape, max_shape)
        config.add_optimization_profile(profile)
    print('Completed parsing the ONNX file')

    print(f'Building an engine from file {onnx_file_path}; this may take a while...')
    t0 = time.time()
    engine = builder.build_engine(network, config)
    if engine is None:
        # Check before opening the output so a failed build does not leave an
        # empty engine file behind (and does not crash on engine.serialize()).
        print('ERROR:Failed to build the engine from %s' % onnx_file_path)
        return None
    with open(engine_file_path, 'wb') as f:
        f.write(engine.serialize())
    t1 = time.time()
    print('Completed creating Engine:%s, %.1f' % (engine_file_path, t1 - t0))
# Compatibility shim: on an interpreter without a FileNotFoundError builtin
# (Python 2) the name is bound to IOError, so locate_files can raise it on
# either interpreter.
try:
    # Bare name lookup used as a probe; it raises NameError when the builtin
    # does not exist.
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError
#EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
def GiB(val):
    """Convert a size expressed in GiB to a whole number of bytes.

    Args:
        val: Size in GiB. Integers give the same result as before; fractional
            values such as 0.5 are accepted as well.

    Returns:
        int: ``val`` * 2**30, truncated to an integer.
    """
    # The previous ``val * 1 << 30`` parsed as ``(val * 1) << 30``, which
    # raises TypeError for floats; multiply by the byte count explicitly.
    return int(val * (1 << 30))
def add_help(description):
    """Parse the command line with a parser that only carries a description.

    The parser defines no options of its own, unknown arguments are
    tolerated, and the parsed result is discarded.

    Args:
        description (str): Text passed to the parser as its description.
    """
    argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    ).parse_known_args()
def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[]):
    '''
    Parses the sample's data-directory arguments and locates the requested files.

    Args:
        description (str): Description of the sample, used as the argument parser description.
        subfolder (str): The subfolder containing data relevant to this sample.
        find_files (List[str]): Filenames to find. Each one is resolved by locate_files.

    Returns:
        Tuple[List[str], List[str]]: The data directories that will be searched,
        and the result of locate_files for ``find_files``.
    '''
    # Standard command-line arguments for all samples.
    default_data_root = os.path.join(os.sep, "usr", "src", "tensorrt", "data")
    cli = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument(
        "-d",
        "--datadir",
        help="Location of the TensorRT sample data directory, and any additional data directories.",
        action="append",
        default=[default_data_root],
    )
    known_args, _ = cli.parse_known_args()

    def resolve_data_dir(data_dir):
        # Prefer <data_dir>/<subfolder>; fall back to data_dir itself when
        # the subfolder is not there.
        candidate = os.path.join(data_dir, subfolder)
        if os.path.exists(candidate):
            chosen = candidate
        else:
            print("WARNING: " + candidate + " does not exist. Trying " + data_dir + " instead.")
            chosen = data_dir
        # Only warn when the final choice is missing; it is returned anyway.
        if not os.path.exists(chosen):
            print("WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(chosen))
        return chosen

    data_paths = list(map(resolve_data_dir, known_args.datadir))
    return data_paths, locate_files(data_paths, find_files)
def locate_files(data_paths, filenames):
    """
    Resolves each requested file against a list of data directories.

    Directories are tried in the order given, so a file present in several
    directories is taken from the first one that contains it.

    Args:
        data_paths (List[str]): The data directories.
        filenames (List[str]): The names of the files to find.

    Returns:
        List[str]: The absolute paths of the files, in the order of ``filenames``.

    Raises:
        FileNotFoundError if a file could not be located.
    """
    resolved = [None] * len(filenames)
    for directory in data_paths:
        for slot, name in enumerate(filenames):
            # A slot that is already filled keeps its earlier match.
            if resolved[slot]:
                continue
            candidate = os.path.abspath(os.path.join(directory, name))
            if os.path.exists(candidate):
                resolved[slot] = candidate

    # Every requested file must have been resolved to an existing path.
    for path, name in zip(resolved, filenames):
        if not (path and os.path.exists(path)):
            raise FileNotFoundError("Could not find {:}. Searched in data paths: {:}".format(name, data_paths))
    return resolved
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    """Pair of a host-side buffer and its device-side counterpart."""

    def __init__(self, host_mem, device_mem):
        # Stored as given; both attributes stay freely reassignable.
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        return f"Host:\n{self.host!s}\nDevice:\n{self.device!s}"

    def __repr__(self):
        # repr and str intentionally show the same text.
        return self.__str__()
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine, input_shape, streamFlag=True):
    """Allocate a host buffer and a device buffer for every binding of an engine.

    Args:
        engine: TensorRT engine; iterating it yields its bindings.
        input_shape: Shape used for any binding whose engine-reported shape
            contains -1. When this is a ``list`` it is indexed by the
            binding's position; any other value is used as the shape itself.
        streamFlag: When true a new CUDA stream is created, otherwise the
            returned stream is ``None``.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``: ``inputs`` and
        ``outputs`` are lists of HostDeviceMem, ``bindings`` holds the device
        allocations converted with ``int()`` in binding order.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream() if streamFlag else None
    shape_per_binding = isinstance(input_shape, list)

    for position, binding in enumerate(engine):
        shape = engine.get_binding_shape(binding)
        if -1 in shape:
            # The engine cannot size a dynamic binding; take the shape from
            # the caller instead.
            shape = input_shape[position] if shape_per_binding else input_shape

        element_count = trt.volume(shape)
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Host buffer first, then a device allocation of the same byte size.
        host_mem = cuda.pagelocked_empty(element_count, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))

        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one inference on ``stream`` and return the host output buffers.

    Args:
        context: Execution context whose ``execute_async`` is called.
        bindings: Binding list passed to ``execute_async``.
        inputs: HostDeviceMem objects copied host -> device before the run.
        outputs: HostDeviceMem objects copied device -> host after the run.
        stream: CUDA stream used for the copies and the execution.
        batch_size: Batch size passed to ``execute_async``.

    Returns:
        list: The ``host`` attribute of every entry in ``outputs``.
    """
    # Upload the inputs.
    for source in inputs:
        cuda.memcpy_htod_async(source.device, source.host, stream)

    # Launch the network.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)

    # Download the predictions.
    for target in outputs:
        cuda.memcpy_dtoh_async(target.host, target.device, stream)

    # Wait for everything queued on the stream before reading the results.
    stream.synchronize()
    return [target.host for target in outputs]
# This function is generalized for multiple inputs/outputs for full dimension networks.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs, outputs, stream):
    """Run one inference through ``execute_async_v2`` and return the host outputs.

    Args:
        context: Execution context whose ``execute_async_v2`` is called.
        bindings: Binding list passed to ``execute_async_v2``.
        inputs: HostDeviceMem objects copied host -> device before the run.
        outputs: HostDeviceMem objects copied device -> host after the run.
        stream: CUDA stream used for the copies and the execution.

    Returns:
        list: The ``host`` attribute of every entry in ``outputs``.
    """
    # Upload the inputs.
    for source in inputs:
        cuda.memcpy_htod_async(source.device, source.host, stream)

    # Launch the network.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

    # Download the predictions.
    for target in outputs:
        cuda.memcpy_dtoh_async(target.host, target.device, stream)

    # Wait for everything queued on the stream before reading the results.
    stream.synchronize()
    return [target.host for target in outputs]
def trt_inference(img, img_h, img_w, context, inputs, outputs, bindings, stream, input_name='input'):
    """Run a TensorRT inference on one image batch with a dynamic H/W input.

    Args:
        img: Input data as a numpy array in NCHW layout.
        img_h: Height of the image fed to the model (needed for dynamic input).
        img_w: Width of the image fed to the model (needed for dynamic input).
        context: TensorRT execution context created by the caller.
        inputs: Input HostDeviceMem list allocated by the caller; only
            ``inputs[0]`` is filled here.
        outputs: Output HostDeviceMem list allocated by the caller.
        bindings: Device bindings matching ``inputs`` and ``outputs``.
        stream: CUDA stream used for the profile switch, copies and execution.
        input_name: Name of the model's input tensor.

    Returns:
        list: The host output buffers (numpy arrays) returned by
        do_inference_v2.

    Raises:
        ValueError: ``img`` has more elements than the buffer allocated for
            the input.
    """
    origin_inputshape = context.get_tensor_shape(input_name)
    context.set_optimization_profile_async(0, stream.handle)
    # Only the last two dimensions (H, W) are overridden.
    origin_inputshape[-2], origin_inputshape[-1] = (img_h, img_w)
    context.set_input_shape(input_name, origin_inputshape)

    # Copy the image INTO the existing host buffer instead of replacing it:
    # the async host-to-device copy expects the buffer that was allocated
    # together with the device memory, and writing into it also converts the
    # data to the buffer's dtype.
    staging = inputs[0].host.reshape(-1)
    flat_img = np.ascontiguousarray(img).reshape(-1)
    if flat_img.size > staging.size:
        # Copying more than was allocated would write past the device buffer.
        raise ValueError('input has %d elements but the allocated input buffer holds only %d'
                         % (flat_img.size, staging.size))
    np.copyto(staging[:flat_img.size], flat_img)

    return do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
# def do_inference_v3(context, bindings, inputs, outputs, stream,h_,w_):
# '''
# Copy from https://github.com/zhaogangthu/keras-yolo3-ocr-tensorrt/blob/master/tensorRT_yolo3/common.py
#
# '''
# # Transfer input data to the GPU.
#
# context.set_binding_shape(0, (1, 3, h_, w_))
#
# [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
# # Run inference.
# context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
# # Transfer predictions back from the GPU.
# [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
# # Synchronize the stream
# stream.synchronize()
# # Return only the host outputs.
# return [out.host for out in outputs]
if __name__=='__main__':
    # U2NET is not imported at module level (the import near the top of this
    # file is commented out), so the constructor call below raised NameError.
    # NOTE(review): the module path is taken from that commented-out import;
    # confirm it matches the project layout.
    from model.u2net import U2NET

    model_path = 'weights/u2net_portrait.pth'
    onnx_name = model_path.replace('.pth', '.onnx')
    trt_name = model_path.replace('.pth', '.engine')

    pth_model = U2NET(3, 1)
    pth_model.load_state_dict(torch.load(model_path))

    # Settings for the (currently disabled) pth -> onnx export step.
    input_names = ['input']
    output_names = ['output%d' % (i) for i in range(7)]
    dynamix_axis = dynamic_hw
    input_shape = (1, 3, 512, 512)
    # Test converting the pth model to an ONNX model:
    #pth2onnx(pth_model,onnx_name,input_shape=input_shape ,input_names=input_names ,output_names=output_names ,dynamix_axis=dynamix_axis )

    # Test converting the ONNX model to a TensorRT engine with dynamic H/W.
    input_profile_shapes = [(1, 3, 512, 512), (1, 3, 1024, 1024), (1, 3, 2048, 2048)]
    input_shape = [1, 3, -1, -1]
    half = True
    max_batch_size = 1
    onnx2engine(onnx_name, trt_name, input_shape=input_shape, half=half, max_batch_size=max_batch_size, input_profile_shapes=input_profile_shapes)
'''
with torch.no_grad():
for i,model_name in enumerate(model_names):
print(f'process model:{model_name}...')
torch.onnx.export(model,
input_tensor,
model_name,
opset_version=11,
input_names=['input'],
output_names=['output0','output1','output2','output3','output4','output5','output6'],
dynamic_axes=dynamic_[i])
print(f'onnx model:{model_name} saved successfully...')
#print('sleep 10s...')
time.sleep(10)
print(f'begin check onnx model:{model_name}...')
onnx_model = onnx.load(model_name)
try:
onnx.checker.check_model(onnx_model)
except Exception as e:
print('model incorrect')
print(e)
else:
print('model correct')
print('*'*50)
print('Begin to test...')
case_1 = np.random.rand(1,3,512,512).astype(np.float32)
case_2 = np.random.rand(2,3,512,512).astype(np.float32)
case_3 = np.random.rand(1,3,224,224).astype(np.float32)
cases = [case_1,case_2,case_3]
providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
for model_name in model_names:
print('-'*50,model_name)
onnx_session = onnxruntime.InferenceSession(model_name,providers=providers)
for i,case in enumerate(cases):
onnx_input = {'input':case}
try:
onnx_output = onnx_session.run(['output0','output1','output2','output3','output4','output5','output6'],onnx_input)[0]
except Exception as e:
print(f'Input:{i} on model:{model_name} failed')
print(e)
else:
print(f'Input:{i} on model:{model_name} succeed')
'''