# AIlib2/segutils/trtUtils2.py
#
# Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
#
# NOTICE TO LICENSEE:
#
# This source code and/or documentation ("Licensed Deliverables") are
# subject to NVIDIA intellectual property rights under U.S. and
# international Copyright laws.
#
# These Licensed Deliverables contained herein is PROPRIETARY and
# CONFIDENTIAL to NVIDIA and is being provided under the terms and
# conditions of a form of NVIDIA software license agreement by and
# between NVIDIA and Licensee ("License Agreement") or electronically
# accepted by Licensee. Notwithstanding any terms or conditions to
# the contrary in the License Agreement, reproduction or disclosure
# of the Licensed Deliverables to any third party without the express
# written consent of NVIDIA is prohibited.
#
# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
# LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
# SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
# PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
# NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
# DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
# NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
# LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
# SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THESE LICENSED DELIVERABLES.
#
# U.S. Government End Users. These Licensed Deliverables are a
# "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
# 1995), consisting of "commercial computer software" and "commercial
# computer software documentation" as such terms are used in 48
# C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
# only as a commercial end item. Consistent with 48 C.F.R.12.212 and
# 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
# U.S. Government End Users acquire the Licensed Deliverables with
# only those rights set forth herein.
#
# Any use of the Licensed Deliverables in individual and commercial
# software must include, in the user documentation and internal
# comments to the code, the above Disclaimer and U.S. Government End
# Users Notice.
#
import argparse
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import torch
import tensorrt as trt
import time
import onnx
import onnxruntime
import os,sys,cv2
#from model.u2net import U2NET
#cuda.init()
model_names = ['u2net.onnx', 'u2net_dynamic_batch.onnx', 'u2net_dynamic_hw.onnx', 'u2net_dynamic_batch-hw.onnx']
dynamic_batch = {'input':   {0: 'batch'},
                 'output0': {0: 'batch'},
                 'output1': {0: 'batch'},
                 'output2': {0: 'batch'},
                 'output3': {0: 'batch'},
                 'output4': {0: 'batch'},
                 'output5': {0: 'batch'},
                 'output6': {0: 'batch'}}
dynamic_hw = {'input':   {2: 'H', 3: 'W'},
              'output0': {2: 'H', 3: 'W'},
              'output1': {2: 'H', 3: 'W'},
              'output2': {2: 'H', 3: 'W'},
              'output3': {2: 'H', 3: 'W'},
              'output4': {2: 'H', 3: 'W'},
              'output5': {2: 'H', 3: 'W'},
              'output6': {2: 'H', 3: 'W'}}
dynamic_batch_hw = {'input':   {0: 'batch', 2: 'H', 3: 'W'},
                    'output0': {0: 'batch', 2: 'H', 3: 'W'},
                    'output1': {0: 'batch', 2: 'H', 3: 'W'},
                    'output2': {0: 'batch', 2: 'H', 3: 'W'},
                    'output3': {0: 'batch', 2: 'H', 3: 'W'},
                    'output4': {0: 'batch', 2: 'H', 3: 'W'},
                    'output5': {0: 'batch', 2: 'H', 3: 'W'},
                    'output6': {0: 'batch', 2: 'H', 3: 'W'}}
dynamic_ = [None, dynamic_batch, dynamic_hw, dynamic_batch_hw]
TRT_LOGGER = trt.Logger()
def pth2onnx(pth_model, onnx_name, input_shape=(1, 3, 512, 512), input_names=['input'], output_names=['output'], dynamix_axis=None):
    # pth_model: the PyTorch model with its weights already loaded
    # onnx_name: output path of the ONNX model
    # input_shape: the model input size (a recommended size)
    # input_names: list of model input names; a model may have several inputs
    # output_names: list of model output names; a model may have several outputs
    # dynamix_axis: dict, or None for a static input. Any dimension of each
    #   input/output can be declared dynamic, e.g.
    #   dynamic_batch_hw = {'input': {0: 'batch', 2: 'H', 3: 'W'}, 'output': {0: 'batch', 2: 'H', 3: 'W'}}
    #   marks B, H and W of 'input' and 'output' as dynamic sizes.
    print('[I] begin converting pth to onnx ...... ', dynamix_axis)
    input_tensor = torch.ones(input_shape)
    if next(pth_model.parameters()).is_cuda:
        input_tensor = input_tensor.to('cuda:0')
    with torch.no_grad():
        torch.onnx.export(pth_model,
                          input_tensor,
                          onnx_name,
                          opset_version=11,
                          input_names=input_names,
                          do_constant_folding=True,
                          output_names=output_names,
                          dynamic_axes=dynamix_axis)
    onnx_model = onnx.load(onnx_name)
    try:
        onnx.checker.check_model(onnx_model)
    except Exception as e:
        print('[Error] model incorrect:', e)
    else:
        print('[I] conversion to onnx finished:', onnx_name)
    print('')
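# Usage sketch for pth2onnx (hypothetical weight path; assumes U2NET is importable
# from model.u2net as in the commented import at the top of this file):
#   pth_model = U2NET(3, 1)
#   pth_model.load_state_dict(torch.load('weights/u2net_portrait.pth'))
#   pth2onnx(pth_model, 'weights/u2net_portrait.onnx',
#            input_shape=(1, 3, 512, 512),
#            input_names=['input'],
#            output_names=['output%d' % i for i in range(7)],
#            dynamix_axis=dynamic_hw)  # dynamic H/W, static batch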
def onnx_inference(onnx_input, model_name, outputName=['output0', 'output1', 'output2', 'output3', 'output4', 'output5', 'output6']):
    providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    print('[I] onnx_inference on model:', model_name)
    #outputName = ['pred_logits', 'pred_points']
    onnx_session = onnxruntime.InferenceSession(model_name, providers=providers)
    try:
        onnx_output = onnx_session.run(outputName, onnx_input)
    except Exception as e:
        onnx_output = None
        print(e)
    return onnx_output
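# Usage sketch for onnx_inference (hypothetical model path):
#   dummy = np.random.rand(1, 3, 512, 512).astype(np.float32)
#   outputs = onnx_inference({'input': dummy}, 'weights/u2net_portrait.onnx')
#   if outputs is not None:
#       print([o.shape for o in outputs])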
def onnx2engine(onnx_file_path, engine_file_path, input_shape=[1, 3, 512, 512], half=True, max_batch_size=1, input_profile_shapes=[None, None, None]):
    # onnx_file_path: input ONNX path
    # engine_file_path: output TensorRT engine path
    # input_shape: default model input size, e.g. [1,3,512,512]; for dynamic input use e.g. [1,3,-1,-1]
    # half: whether to use fp16, default True
    # max_batch_size: maximum batch size, default 1
    # input_profile_shapes: for dynamic input, the three profile shapes [min, opt, max];
    #   in that case input_shape must contain -1,
    #   e.g. (1,3,512,512), (1,3,1024,1024), (1,3,2048,2048)
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    config = builder.create_builder_config()
    parser = trt.OnnxParser(network, TRT_LOGGER)
    runtime = trt.Runtime(TRT_LOGGER)
    # Maximum workspace memory TensorRT may use while optimizing;
    # lower it if you run out of GPU memory.
    config.max_workspace_size = 1 << 30  # 1 GiB
    if builder.platform_has_fast_fp16 and half:
        config.set_flag(trt.BuilderFlag.FP16)
    builder.max_batch_size = max_batch_size  # at inference time keep batch_size <= max_batch_size
    # parse model file
    if not os.path.exists(onnx_file_path):
        print(f'onnx file {onnx_file_path} not found, please run torch_2_onnx.py first to generate it')
        exit(0)
    print(f'Loading ONNX file from path {onnx_file_path}...')
    with open(onnx_file_path, 'rb') as model:
        print('Beginning ONNX file parsing')
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
    # Static input setting
    network.get_input(0).shape = input_shape
    # Dynamic input setting: dynamic shapes are configured through an optimization
    # profile on the builder; bind one profile per dynamic input.
    if -1 in input_shape:
        profile = builder.create_optimization_profile()
        # min, opt and max shapes; inference-time inputs must fall inside this range
        profile.set_shape(network.get_input(0).name, input_profile_shapes[0], input_profile_shapes[1], input_profile_shapes[2])
        config.add_optimization_profile(profile)
    print('Completed parsing the ONNX file')
    print(f'Building an engine from file {onnx_file_path}; this may take a while...')
    t0 = time.time()
    engine = builder.build_engine(network, config)
    if engine is None:
        print('ERROR: engine build failed')
        return None
    with open(engine_file_path, 'wb') as f:
        f.write(engine.serialize())
    t1 = time.time()
    print('Completed creating Engine: %s, %.1fs' % (engine_file_path, t1 - t0))
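# Usage sketch for onnx2engine with a dynamic-H/W profile (hypothetical paths):
#   onnx2engine('weights/u2net_portrait.onnx', 'weights/u2net_portrait.engine',
#               input_shape=[1, 3, -1, -1], half=True, max_batch_size=1,
#               input_profile_shapes=[(1, 3, 512, 512), (1, 3, 1024, 1024), (1, 3, 2048, 2048)])
# The serialized engine can later be reloaded with the TensorRT runtime:
#   with open('weights/u2net_portrait.engine', 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
#       engine = runtime.deserialize_cuda_engine(f.read())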
try:
    # Sometimes python2 does not understand FileNotFoundError
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError
#EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
def GiB(val):
    return val * (1 << 30)
def add_help(description):
    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    args, _ = parser.parse_known_args()
def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[]):
    '''
    Parses sample arguments.
    Args:
        description (str): Description of the sample.
        subfolder (str): The subfolder containing data relevant to this sample.
        find_files (List[str]): A list of filenames to find. Each filename will be replaced with an absolute path.
    Returns:
        Tuple[List[str], List[str]]: The data directory paths and the absolute paths of the located files.
    '''
    # Standard command-line arguments for all samples.
    kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data")
    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory, and any additional data directories.", action="append", default=[kDEFAULT_DATA_ROOT])
    args, _ = parser.parse_known_args()

    def get_data_path(data_dir):
        # If the subfolder exists, append it to the path, otherwise use the provided path as-is.
        data_path = os.path.join(data_dir, subfolder)
        if not os.path.exists(data_path):
            print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.")
            data_path = data_dir
        # Make sure data directory exists.
        if not os.path.exists(data_path):
            print("WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(data_path))
        return data_path

    data_paths = [get_data_path(data_dir) for data_dir in args.datadir]
    return data_paths, locate_files(data_paths, find_files)
def locate_files(data_paths, filenames):
    """
    Locates the specified files in the specified data directories.
    If a file exists in multiple data directories, the first directory is used.
    Args:
        data_paths (List[str]): The data directories.
        filenames (List[str]): The names of the files to find.
    Returns:
        List[str]: The absolute paths of the files.
    Raises:
        FileNotFoundError if a file could not be located.
    """
    found_files = [None] * len(filenames)
    for data_path in data_paths:
        # Find all requested files.
        for index, (found, filename) in enumerate(zip(found_files, filenames)):
            if not found:
                file_path = os.path.abspath(os.path.join(data_path, filename))
                if os.path.exists(file_path):
                    found_files[index] = file_path
    # Check that all files were found
    for f, filename in zip(found_files, filenames):
        if not f or not os.path.exists(f):
            raise FileNotFoundError("Could not find {:}. Searched in data paths: {:}".format(filename, data_paths))
    return found_files
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine, input_shape, streamFlag=True):
    inputs = []
    outputs = []
    bindings = []
    if streamFlag:
        stream = cuda.Stream()
    else:
        stream = None
    for ib, binding in enumerate(engine):
        dims = engine.get_binding_shape(binding)
        #print(engine.get_binding_name(ib), dims, engine.max_batch_size)
        if -1 in dims:
            # Dynamic binding: fall back to the caller-supplied shape(s).
            if isinstance(input_shape, list):
                dims = input_shape[ib]
            else:
                dims = input_shape
        #size = trt.volume(dims) * engine.max_batch_size
        size = trt.volume(dims)
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
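# Sketch of the intended call sequence (hypothetical engine path; the shapes must
# match the optimization profile the engine was built with):
#   with open('weights/u2net_portrait.engine', 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
#       engine = runtime.deserialize_cuda_engine(f.read())
#   context = engine.create_execution_context()
#   # One shape per binding (1 input + 7 outputs for the u2net example above):
#   shapes = [(1, 3, 512, 512)] + [(1, 1, 512, 512)] * 7
#   inputs, outputs, bindings, stream = allocate_buffers(engine, shapes)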
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
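# Note: do_inference targets the older implicit-batch API (context.execute_async).
# Engines built with the EXPLICIT_BATCH flag, as in onnx2engine above, should go
# through do_inference_v2 below instead.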
# This function is generalized for multiple inputs/outputs for full dimension networks.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    #context.execute_v2(bindings)  # synchronous variant
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
def trt_inference(img, img_h, img_w, context, inputs, outputs, bindings, stream, input_name='input'):
    # Inputs:
    #   img -- numpy array in NCHW layout
    #   img_h, img_w -- H and W of the image fed to the model; required for dynamic input
    #   context -- the TensorRT execution context created by the caller
    #   inputs, outputs, bindings, stream -- buffers allocated when the first image was
    #     processed, with their device addresses bound to the TensorRT bindings
    #   input_name -- name of the model's input tensor
    # Output:
    #   trt_outputs -- a list whose elements are numpy arrays
    origin_inputshape = context.get_tensor_shape(input_name)
    #if origin_inputshape[-1] == -1:
    context.set_optimization_profile_async(0, stream.handle)
    origin_inputshape[-2], origin_inputshape[-1] = (img_h, img_w)
    context.set_input_shape(input_name, origin_inputshape)
    inputs[0].host = np.ascontiguousarray(img)
    trt_outputs = do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    return trt_outputs
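# End-to-end sketch with a dynamic-H/W engine (hypothetical image; context and
# buffers come from the allocate_buffers sketch above and are reused across calls):
#   img = np.random.rand(1, 3, 512, 512).astype(np.float32)
#   trt_outputs = trt_inference(img, 512, 512, context, inputs, outputs, bindings, stream)
#   pred = trt_outputs[0].reshape(1, 1, 512, 512)  # assumes output0 is an (N,1,H,W) mask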
# def do_inference_v3(context, bindings, inputs, outputs, stream, h_, w_):
#     '''
#     Copy from https://github.com/zhaogangthu/keras-yolo3-ocr-tensorrt/blob/master/tensorRT_yolo3/common.py
#     '''
#     context.set_binding_shape(0, (1, 3, h_, w_))
#     # Transfer input data to the GPU.
#     [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
#     # Run inference.
#     context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
#     # Transfer predictions back from the GPU.
#     [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
#     # Synchronize the stream
#     stream.synchronize()
#     # Return only the host outputs.
#     return [out.host for out in outputs]
if __name__ == '__main__':
    from model.u2net import U2NET  # needed for the pth2onnx demo below
    model_path = 'weights/u2net_portrait.pth'
    onnx_name = model_path.replace('.pth', '.onnx')
    trt_name = model_path.replace('.pth', '.engine')
    pth_model = U2NET(3, 1)
    pth_model.load_state_dict(torch.load(model_path))
    input_names = ['input']
    output_names = ['output%d' % i for i in range(7)]
    dynamix_axis = dynamic_hw
    input_shape = (1, 3, 512, 512)
    # Test converting the pth model to onnx
    #pth2onnx(pth_model, onnx_name, input_shape=input_shape, input_names=input_names, output_names=output_names, dynamix_axis=dynamix_axis)
    # Test converting the onnx model to a trt engine
    input_profile_shapes = [(1, 3, 512, 512), (1, 3, 1024, 1024), (1, 3, 2048, 2048)]
    input_shape = [1, 3, -1, -1]
    half = True
    max_batch_size = 1
    onnx2engine(onnx_name, trt_name, input_shape=input_shape, half=half, max_batch_size=max_batch_size, input_profile_shapes=input_profile_shapes)
'''
with torch.no_grad():
    for i, model_name in enumerate(model_names):
        print(f'process model:{model_name}...')
        torch.onnx.export(model,
                          input_tensor,
                          model_name,
                          opset_version=11,
                          input_names=['input'],
                          output_names=['output0','output1','output2','output3','output4','output5','output6'],
                          dynamic_axes=dynamic_[i])
        print(f'onnx model:{model_name} saved successfully...')
        #print('sleep 10s...')
        time.sleep(10)
        print(f'begin check onnx model:{model_name}...')
        onnx_model = onnx.load(model_name)
        try:
            onnx.checker.check_model(onnx_model)
        except Exception as e:
            print('model incorrect')
            print(e)
        else:
            print('model correct')

print('*'*50)
print('Begin to test...')
case_1 = np.random.rand(1, 3, 512, 512).astype(np.float32)
case_2 = np.random.rand(2, 3, 512, 512).astype(np.float32)
case_3 = np.random.rand(1, 3, 224, 224).astype(np.float32)
cases = [case_1, case_2, case_3]
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
for model_name in model_names:
    print('-'*50, model_name)
    onnx_session = onnxruntime.InferenceSession(model_name, providers=providers)
    for i, case in enumerate(cases):
        onnx_input = {'input': case}
        try:
            onnx_output = onnx_session.run(['output0','output1','output2','output3','output4','output5','output6'], onnx_input)[0]
        except Exception as e:
            print(f'Input:{i} on model:{model_name} failed')
            print(e)
        else:
            print(f'Input:{i} on model:{model_name} succeed')
'''