Object Detection Inference Using TensorRT

The object detection sample in CVCUDA uses the PeopleNet model from NGC. The ETLT model must first be serialized to a TensorRT engine using the tao-converter; that engine is then provided as input to the sample application. The model uses an implicit batch size, so the maximum batch size must be specified during serialization. We allocate the output tensors for TensorRT in advance, based on the output layer dimensions inferred from the loaded TensorRT engine.

import logging
import os
import urllib.request

import cvcuda
import tensorrt as trt


class ObjectDetectionTensorRT:
    def __init__(
        self,
        output_dir,
        batch_size,
        image_size,
        device_id,
        cvcuda_perf,
    ):
        self.logger = logging.getLogger(__name__)
        self.output_dir = output_dir
        self.batch_size = batch_size
        self.image_size = image_size
        self.device_id = device_id
        self.cvcuda_perf = cvcuda_perf

        # Download and prepare the models for the first use.
        etlt_model_path = os.path.join(self.output_dir, "resnet34_peoplenet_int8.etlt")
        trt_engine_file_path = os.path.join(
            self.output_dir,
            "resnet34_peoplenet_int8.%d.%d.%d.trtmodel"
            % (
                batch_size,
                image_size[1],
                image_size[0],
            ),
        )

        # Check if we have a previously generated engine.
        if not os.path.isfile(trt_engine_file_path):
            if not os.path.isfile(etlt_model_path):
                # We need to download the ETLT model first from NGC.
                model_url = (
                    "https://api.ngc.nvidia.com/v2/models/"
                    "nvidia/tao/peoplenet/versions/deployable_quantized_v2.6.1/"
                    "files/resnet34_peoplenet_int8.etlt"
                )
                self.logger.info(
                    "Downloading the PeopleNet model from NGC: %s" % model_url
                )
                urllib.request.urlretrieve(model_url, etlt_model_path)
                self.logger.info("Download complete. Saved to: %s" % etlt_model_path)

            # Convert the ETLT model to a TensorRT engine using the tao-converter.
            self.logger.info("Converting the PeopleNet model to TensorRT...")
            if os.system(
                "tao-converter -e %s -k tlt_encode -d 3,%d,%d -m %d -i nchw %s"
                % (
                    trt_engine_file_path,
                    image_size[1],
                    image_size[0],
                    batch_size,
                    etlt_model_path,
                )
            ):
                raise Exception("Conversion failed.")
            else:
                self.logger.info(
                    "Conversion complete. Saved to: %s" % trt_engine_file_path
                )

        # Once the TensorRT engine generation is all done, we load it.
        trt_logger = trt.Logger(trt.Logger.ERROR)
        with open(trt_engine_file_path, "rb") as f, trt.Runtime(trt_logger) as runtime:
            # Keeping this as a class variable because we want to be able to
            # allocate the output tensors either on its first use or when the
            # batch size changes.
            self.trt_model = runtime.deserialize_cuda_engine(f.read())

        # Create the execution context.
        self.model = self.trt_model.create_execution_context()

        # We will allocate the output tensors and their bindings either when we
        # use them for the first time or when the batch size changes.
        self.output_tensors, self.output_idx = None, None

        self.logger.info("Using TensorRT as the inference engine.")
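
The constructor defers the allocation of the output tensors to the setup_tensort_bindings helper from the sample's utilities, which is invoked from __call__ further below. To illustrate the idea, the snippet below is a minimal sketch of what such a helper could look like, assuming an implicit-batch engine and PyTorch GPU tensors for the outputs (so that data_ptr() can later be used as a device binding). It is illustrative only and not the sample's actual implementation.

import numpy as np
import tensorrt as trt
import torch


def setup_tensort_bindings(trt_model, batch_size, device_id, logger):
    # Illustrative sketch, not the sample's actual implementation.
    # Walk over the engine bindings and allocate one GPU tensor per output
    # binding. With an implicit-batch engine the binding shape does not
    # include the batch dimension, so we prepend it ourselves.
    output_tensors, output_idx = [], None

    for idx in range(trt_model.num_bindings):
        if trt_model.binding_is_input(idx):
            continue
        if output_idx is None:
            output_idx = idx  # remember the index of the first output binding

        shape = (batch_size,) + tuple(trt_model.get_binding_shape(idx))
        np_dtype = np.dtype(trt.nptype(trt_model.get_binding_dtype(idx)))

        logger.info("Allocating output binding %d with shape %s" % (idx, shape))
        output_tensors.append(
            torch.zeros(
                shape,
                dtype=getattr(torch, np_dtype.name),
                device="cuda:%d" % device_id,
            )
        )

    return output_tensors, output_idx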

To run inference, the __call__ method is used. It sets up the correct I/O bindings and performs the forward inference pass on the current CUDA stream. The input data is passed directly from the CVCUDA tensor, without any further conversions, by accessing its __cuda_array_interface__ member, as shown in the code below.

def __call__(self, tensor):
    self.cvcuda_perf.push_range("inference.tensorrt")

    # Grab the data directly from the pre-allocated CVCUDA tensor.
    input_bindings = [tensor.cuda().__cuda_array_interface__["data"][0]]
    output_bindings = []

    actual_batch_size = tensor.shape[0]

    # Allocate the output tensors on first use or when the batch size changes.
    if not self.output_tensors or actual_batch_size != self.batch_size:
        self.output_tensors, self.output_idx = setup_tensort_bindings(
            self.trt_model,
            actual_batch_size,
            self.device_id,
            self.logger,
        )

    for t in self.output_tensors:
        output_bindings.append(t.data_ptr())
    io_bindings = input_bindings + output_bindings

    # Call inference for the implicit-batch engine on the current CUDA stream.
    self.model.execute_async(
        actual_batch_size,
        bindings=io_bindings,
        stream_handle=cvcuda.Stream.current.handle,
    )

    boxes = self.output_tensors[0]
    score = self.output_tensors[1]

    self.cvcuda_perf.pop_range()  # inference.tensorrt
    return boxes, score
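
To put the two pieces together, a hypothetical usage of this class could look like the following. The no-op perf helper, the input tensor construction, and the chosen sizes are assumptions for illustration; in the sample they come from the surrounding pipeline (its perf utility and the preprocessing stage).

import cvcuda
import torch


class NoOpPerf:
    # Minimal stand-in for the sample's perf utility (assumption): this
    # sketch only needs push_range()/pop_range() to be callable.
    def push_range(self, *args, **kwargs):
        pass

    def pop_range(self, *args, **kwargs):
        pass


detector = ObjectDetectionTensorRT(
    output_dir="/tmp/models",
    batch_size=4,
    image_size=(960, 544),  # (width, height) expected by PeopleNet
    device_id=0,
    cvcuda_perf=NoOpPerf(),
)

# NCHW float input as produced by the preprocessing stage, wrapped as a
# CVCUDA tensor without copying the underlying GPU buffer.
torch_input = torch.rand(4, 3, 544, 960, dtype=torch.float32, device="cuda:0")
cvcuda_input = cvcuda.as_tensor(torch_input, "NCHW")

boxes, score = detector(cvcuda_input)
print(boxes.shape, score.shape)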