""" An example that uses TensorRT's Python api to make inferences. """ import os import shutil import sys import threading import time import cv2 import numpy as np import torch import pycuda.autoinit import pycuda.driver as cuda import tensorrt as trt def get_img_path_batches(batch_size, img_dir): ret = [] batch = [] for root, dirs, files in os.walk(img_dir): for name in files: if len(batch) == batch_size: ret.append(batch) batch = [] batch.append(os.path.join(root, name)) if len(batch) > 0: ret.append(batch) return ret with open("imagenet_classes.txt") as f: classes = [line.strip() for line in f.readlines()] class YoLov5TRT(object): """ description: A YOLOv5 class that warps TensorRT ops, preprocess and postprocess ops. """ def __init__(self, engine_file_path): # Create a Context on this device, self.ctx = cuda.Device(0).make_context() stream = cuda.Stream() TRT_LOGGER = trt.Logger(trt.Logger.INFO) runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() host_inputs = [] cuda_inputs = [] host_outputs = [] cuda_outputs = [] bindings = [] self.mean = (0.485, 0.456, 0.406) self.std = (0.229, 0.224, 0.225) for binding in engine: print('binding:', binding, engine.get_binding_shape(binding)) size = trt.volume(engine.get_binding_shape( binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(cuda_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): self.input_w = engine.get_binding_shape(binding)[-1] self.input_h = engine.get_binding_shape(binding)[-2] host_inputs.append(host_mem) cuda_inputs.append(cuda_mem) else: host_outputs.append(host_mem) cuda_outputs.append(cuda_mem) # Store self.stream = stream self.context = context self.engine = engine self.host_inputs = host_inputs self.cuda_inputs = cuda_inputs self.host_outputs = host_outputs self.cuda_outputs = cuda_outputs self.bindings = bindings self.batch_size = engine.max_batch_size def infer(self, raw_image_generator): threading.Thread.__init__(self) # Make self the active context, pushing it on top of the context stack. self.ctx.push() # Restore stream = self.stream context = self.context engine = self.engine host_inputs = self.host_inputs cuda_inputs = self.cuda_inputs host_outputs = self.host_outputs cuda_outputs = self.cuda_outputs bindings = self.bindings # Do image preprocess batch_image_raw = [] batch_input_image = np.empty( shape=[self.batch_size, 3, self.input_h, self.input_w]) for i, image_raw in enumerate(raw_image_generator): batch_image_raw.append(image_raw) input_image = self.preprocess_cls_image(image_raw) np.copyto(batch_input_image[i], input_image) batch_input_image = np.ascontiguousarray(batch_input_image) # Copy input image to host buffer np.copyto(host_inputs[0], batch_input_image.ravel()) start = time.time() # Transfer input data to the GPU. cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) # Run inference. context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. 
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        # host_outputs[0] holds the flattened logits for the whole batch.
        output = host_outputs[0]
        # Do postprocess: top-1 class per image; only iterate over the images
        # actually present, since the final batch may be shorter than batch_size.
        classes_ls, predicted_conf_ls, category_id_ls = self.postprocess_cls(
            output)
        for i in range(len(batch_image_raw)):
            cv2.putText(batch_image_raw[i], str(classes_ls[i]), (10, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
            print(classes_ls[i], predicted_conf_ls[i])
        return batch_image_raw, end - start

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_cls_image(self, input_img):
        # BGR -> RGB; note cv2.resize takes dsize as (width, height)
        im = cv2.cvtColor(input_img, cv2.COLOR_BGR2RGB)
        im = cv2.resize(im, (self.input_w, self.input_h))
        # Scale to [0, 1], then apply ImageNet mean/std normalization
        im = np.float32(im)
        im /= 255.0
        im -= self.mean
        im /= self.std
        # HWC -> CHW
        im = im.transpose(2, 0, 1)
        # prepare batch
        batch_data = np.expand_dims(im, axis=0)
        return batch_data

    def postprocess_cls(self, output_data):
        classes_ls = []
        predicted_conf_ls = []
        category_id_ls = []
        output_data = output_data.reshape(self.batch_size, -1)
        output_data = torch.Tensor(output_data)
        # Softmax over the class dimension, then keep the top-3 scores
        p = torch.nn.functional.softmax(output_data, dim=1)
        score, index = torch.topk(p, 3)
        for ind in range(index.shape[0]):
            input_category_id = index[ind][0].item()
            category_id_ls.append(input_category_id)
            predicted_confidence = score[ind][0].item()
            predicted_conf_ls.append(predicted_confidence)
            classes_ls.append(classes[input_category_id])
        return classes_ls, predicted_conf_ls, category_id_ls


class inferThread(threading.Thread):
    def __init__(self, yolov5_wrapper, image_path_batch):
        threading.Thread.__init__(self)
        self.yolov5_wrapper = yolov5_wrapper
        self.image_path_batch = image_path_batch

    def run(self):
        batch_image_raw, use_time = self.yolov5_wrapper.infer(
            self.yolov5_wrapper.get_raw_image(self.image_path_batch))
        for i, img_path in enumerate(self.image_path_batch):
            parent, filename = os.path.split(img_path)
            save_name = os.path.join('output', filename)
            # Save image
            cv2.imwrite(save_name, batch_image_raw[i])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(
            self.image_path_batch, use_time * 1000))


class warmUpThread(threading.Thread):
    def __init__(self, yolov5_wrapper):
        threading.Thread.__init__(self)
        self.yolov5_wrapper = yolov5_wrapper

    def run(self):
        batch_image_raw, use_time = self.yolov5_wrapper.infer(
            self.yolov5_wrapper.get_raw_image_zeros())
        print('warm_up->{}, time->{:.2f}ms'.format(
            batch_image_raw[0].shape, use_time * 1000))


if __name__ == "__main__":
    # Engine file path; can be overridden by the first CLI argument.
    engine_file_path = "build/yolov5s-cls.engine"
    if len(sys.argv) > 1:
        engine_file_path = sys.argv[1]

    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')
    # a YoLov5TRT instance
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    try:
        print('batch size is', yolov5_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(
            yolov5_wrapper.batch_size, image_dir)

        for i in range(10):
            # create a new thread to do warm_up
            thread1 = warmUpThread(yolov5_wrapper)
            thread1.start()
            thread1.join()
        for batch in image_path_batches:
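            # Note: start()/join() back to back means batches still run one at
            # a time; the per-batch thread mainly demonstrates that infer() is
            # safe to call from a worker thread, thanks to the ctx.push()/pop()
            # pair inside infer().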
            # create a new thread to do inference
            thread1 = inferThread(yolov5_wrapper, batch)
            thread1.start()
            thread1.join()
    finally:
        # destroy the instance
        yolov5_wrapper.destroy()
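
# A minimal usage sketch, assuming this file is saved as yolov5_cls_trt.py and
# that the engine path (build/yolov5s-cls.engine) and image directory (images/)
# match the defaults above; these names are assumptions, not requirements:
#
#   python yolov5_cls_trt.py build/yolov5s-cls.engine
#
# Each input image is written to output/ with its top-1 class drawn on it.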