|
- """
- An example that uses TensorRT's Python api to make inferences.
- """
- import os
- import shutil
- import sys
- import threading
- import time
- import cv2
- import numpy as np
- import torch
- import pycuda.autoinit
- import pycuda.driver as cuda
- import tensorrt as trt
-
-
def get_img_path_batches(batch_size, img_dir):
    """
    description: Collect the path of every file found under img_dir and group
                 the paths into batches.
    param:
        batch_size: maximum number of paths per batch, must be >= 1
        img_dir:    directory that is walked recursively with os.walk
    return:
        a list of lists of file paths; every batch holds batch_size paths
        except possibly the last one. The list is empty when no file is found.
    raise:
        ValueError: if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError(
            'batch_size must be >= 1, got {}'.format(batch_size))

    paths = []
    for root, dirs, files in os.walk(img_dir):
        # os.walk reports entries in an arbitrary, filesystem dependent order.
        # Sorting dirs in place fixes the traversal order and sorting files
        # fixes the order inside a directory, so batches are reproducible.
        dirs.sort()
        paths.extend(os.path.join(root, name) for name in sorted(files))

    return [paths[i:i + batch_size]
            for i in range(0, len(paths), batch_size)]
-
-
# Read the class names at import time, one name per line with surrounding
# whitespace removed; the position in the list is the index used to look a
# name up.
with open("imagenet_classes.txt") as f:
    classes = [line.strip() for line in f]
-
-
class YoLov5TRT(object):
    """
    description: A YOLOv5 classification class that wraps TensorRT ops,
                 preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context on device 0, deserialize the engine
                     and allocate one host and one device buffer per binding.
        param:
            engine_file_path: path of the serialized TensorRT engine file
        raise:
            RuntimeError: if the engine cannot be deserialized
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            TRT_LOGGER = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(TRT_LOGGER)

            # Deserialize the engine from file
            with open(engine_file_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            if engine is None:
                raise RuntimeError(
                    'failed to deserialize engine: {}'.format(engine_file_path))
            context = engine.create_execution_context()

            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []
            # Per-channel constants used by preprocess_cls_image after the
            # pixel values have been scaled to [0, 1].
            self.mean = (0.485, 0.456, 0.406)
            self.std = (0.229, 0.224, 0.225)

            for binding in engine:
                shape = engine.get_binding_shape(binding)
                print('binding:', binding, shape)
                size = trt.volume(shape) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(cuda_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    # The last two dimensions of the input binding are taken
                    # as height and width of the network input.
                    self.input_w = shape[-1]
                    self.input_h = shape[-2]
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)
        except Exception:
            # make_context() made the new context current. Pop it so a failed
            # construction does not leave it on the context stack.
            self.ctx.pop()
            raise

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        """
        description: Preprocess a batch of images, run the engine once and
                     draw the top-1 class name on every image.
        param:
            raw_image_generator: iterable yielding at most batch_size images
                                 accepted by preprocess_cls_image
        return:
            batch_image_raw: list of the yielded images, each with its class
                             name drawn on it
            use_time:        seconds spent on the host->device copy, the
                             inference and the device->host copy
        raise:
            ValueError: if more than batch_size images are yielded
        """
        stream = self.stream
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs

        # Do image preprocess. Rows that are never filled (a final batch with
        # fewer images than batch_size) stay zero instead of uninitialised.
        batch_image_raw = []
        batch_input_image = np.zeros(
            shape=[self.batch_size, 3, self.input_h, self.input_w],
            dtype=np.float32)
        for i, image_raw in enumerate(raw_image_generator):
            if i >= self.batch_size:
                raise ValueError(
                    'got more than batch_size={} images'.format(self.batch_size))
            batch_image_raw.append(image_raw)
            input_image = self.preprocess_cls_image(image_raw)
            np.copyto(batch_input_image[i], input_image)

        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            self.context.execute_async(batch_size=self.batch_size,
                                       bindings=self.bindings,
                                       stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack,
            # deactivating it, even when the inference failed.
            self.ctx.pop()

        # Do postprocess once for the whole batch. The result lists have
        # batch_size entries; only the first len(batch_image_raw) of them
        # belong to real images.
        classes_ls, predicted_conf_ls, category_id_ls = self.postprocess_cls(
            host_outputs[0])
        for i, image_raw in enumerate(batch_image_raw):
            cv2.putText(image_raw, str(classes_ls[i]), (10, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
            print(classes_ls[i], predicted_conf_ls[i])
        return batch_image_raw, end - start

    def destroy(self):
        """
        description: Pop the context that __init__ created and left current.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        param:
            image_path_batch: iterable of image file paths
        return:
            a generator yielding the result of cv2.imread for every path
        """
        # NOTE: cv2.imread returns None for a file it cannot decode; such an
        # entry makes preprocess_cls_image fail.
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        param:
            image_path_batch: unused
        return:
            a generator yielding batch_size all-zero uint8 images of shape
            (input_h, input_w, 3)
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_cls_image(self, input_img):
        """
        description: Convert a BGR image to RGB, resize it to the network
                     input size, scale it to [0, 1] and normalize it with
                     self.mean and self.std.
        param:
            input_img: image as a BGR array, e.g. the result of cv2.imread
        return:
            float32 array of shape (1, 3, input_h, input_w)
        """
        im = cv2.cvtColor(input_img, cv2.COLOR_BGR2RGB)
        # cv2.resize expects the target size as (width, height).
        im = cv2.resize(im, (self.input_w, self.input_h))
        im = np.float32(im)
        im /= 255.0
        im -= self.mean
        im /= self.std
        # HWC -> CHW
        im = im.transpose(2, 0, 1)
        # prepare batch
        batch_data = np.expand_dims(im, axis=0)
        return batch_data

    def postprocess_cls(self, output_data):
        """
        description: Turn the raw network output into the top-1 prediction of
                     every batch entry.
        param:
            output_data: array holding the scores of the whole batch; it is
                         reshaped to (batch_size, -1)
        return:
            classes_ls:        class name per batch entry, looked up in classes
            predicted_conf_ls: softmax probability of that class
            category_id_ls:    index of that class
        """
        output_data = torch.Tensor(output_data.reshape(self.batch_size, -1))
        p = torch.nn.functional.softmax(output_data, dim=1)
        # Only the best entry is used; cap k so that outputs with fewer than
        # three classes do not make topk fail.
        score, index = torch.topk(p, min(3, p.shape[1]))
        category_id_ls = index[:, 0].tolist()
        predicted_conf_ls = score[:, 0].tolist()
        classes_ls = [classes[category_id] for category_id in category_id_ls]
        return classes_ls, predicted_conf_ls, category_id_ls
-
-
class inferThread(threading.Thread):
    """
    description: Thread that runs one batch of image files through the wrapper
                 and writes the returned images into the output directory.
    """

    def __init__(self, yolov5_wrapper, image_path_batch):
        """
        param:
            yolov5_wrapper:   object providing get_raw_image and infer
            image_path_batch: paths of the images that form the batch
        """
        super().__init__()
        self.image_path_batch = image_path_batch
        self.yolov5_wrapper = yolov5_wrapper

    def run(self):
        """
        description: Infer the batch, save every returned image under its
                     original file name and print the measured time.
        """
        wrapper = self.yolov5_wrapper
        raw_images = wrapper.get_raw_image(self.image_path_batch)
        result_images, elapsed = wrapper.infer(raw_images)
        for idx, src_path in enumerate(self.image_path_batch):
            # Only the file name is kept, the source directory is dropped.
            target = os.path.join('output', os.path.basename(src_path))
            # Save image
            cv2.imwrite(target, result_images[idx])
        print('input->{}, time->{:.2f}ms, saving into output/'.format(
            self.image_path_batch, elapsed * 1000))
-
-
class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference on the wrapper's all-zero
                 warm-up images and prints the measured time.
    """

    def __init__(self, yolov5_wrapper):
        """
        param:
            yolov5_wrapper: object providing get_raw_image_zeros and infer
        """
        super().__init__()
        self.yolov5_wrapper = yolov5_wrapper

    def run(self):
        """
        description: Infer the warm-up batch and report the shape of its
                     first image together with the elapsed time.
        """
        wrapper = self.yolov5_wrapper
        warmup_images, elapsed = wrapper.infer(wrapper.get_raw_image_zeros())
        first_shape = warmup_images[0].shape
        print(
            'warm_up->{}, time->{:.2f}ms'.format(first_shape, elapsed * 1000))
-
-
if __name__ == "__main__":
    # The engine path defaults to the build output and is overridden by the
    # first command line argument when one is given.
    engine_file_path = (
        sys.argv[1] if len(sys.argv) > 1 else "build/yolov5s-cls.engine")

    # Start every run with an empty output directory.
    if os.path.exists('output/'):
        shutil.rmtree('output/')
    os.makedirs('output/')

    # a YoLov5TRT instance
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    try:
        print('batch size is', yolov5_wrapper.batch_size)

        image_dir = "images/"
        image_path_batches = get_img_path_batches(
            yolov5_wrapper.batch_size, image_dir)

        # Ten warm-up runs, each in its own thread that is joined before the
        # next one starts.
        for _ in range(10):
            warmup_worker = warmUpThread(yolov5_wrapper)
            warmup_worker.start()
            warmup_worker.join()

        # One thread per batch, again executed one after another.
        for image_paths in image_path_batches:
            infer_worker = inferThread(yolov5_wrapper, image_paths)
            infer_worker.start()
            infer_worker.join()
    finally:
        # destroy the instance
        yolov5_wrapper.destroy()
|