Object Detection Inference Using TensorRT

The object detection sample in CVCUDA uses the PeopleNet model from NGC. The ETLT model must first be serialized to a TensorRT engine using the tao-converter; that engine is then provided as input to the sample application. The model uses an implicit batch size, so the maximum batch size must be specified during serialization. We allocate the output tensors for TensorRT in advance, based on the output layer dimensions inferred from the loaded TensorRT engine.

import logging
import os
import urllib.request

import cvcuda
import tensorrt as trt


class ObjectDetectionTensorRT:
    def __init__(
        self,
        output_dir,
        batch_size,
        image_size,
        device_id,
        cvcuda_perf,
    ):
        self.logger = logging.getLogger(__name__)
        self.output_dir = output_dir
        self.batch_size = batch_size
        self.image_size = image_size
        self.device_id = device_id
        self.cvcuda_perf = cvcuda_perf

        # Download and prepare the models for the first use.
        etlt_model_path = os.path.join(self.output_dir, "resnet34_peoplenet_int8.etlt")
        trt_engine_file_path = os.path.join(
            self.output_dir,
            "resnet34_peoplenet_int8.%d.%d.%d.trtmodel"
            % (
                batch_size,
                image_size[1],
                image_size[0],
            ),
        )

        # Check if we have a previously generated engine.
        if not os.path.isfile(trt_engine_file_path):
            if not os.path.isfile(etlt_model_path):
                # We need to download the ETLT model first from NGC.
                model_url = (
                    "https://api.ngc.nvidia.com/v2/models/"
                    "nvidia/tao/peoplenet/versions/deployable_quantized_v2.6.1/"
                    "files/resnet34_peoplenet_int8.etlt"
                )
                self.logger.info(
                    "Downloading the PeopleNet model from NGC: %s" % model_url
                )
                urllib.request.urlretrieve(model_url, etlt_model_path)
                self.logger.info("Download complete. Saved to: %s" % etlt_model_path)

            # Convert the ETLT model to a TensorRT engine using the tao-converter.
            self.logger.info("Converting the PeopleNet model to TensorRT...")
            if os.system(
                "tao-converter -e %s -k tlt_encode -d 3,%d,%d -m %d -i nchw %s"
                % (
                    trt_engine_file_path,
                    image_size[1],
                    image_size[0],
                    batch_size,
                    etlt_model_path,
                )
            ):
                raise Exception("Conversion failed.")
            else:
                self.logger.info(
                    "Conversion complete. Saved to: %s" % trt_engine_file_path
                )

        # Once the TensorRT engine generation is all done, we load it.
        trt_logger = trt.Logger(trt.Logger.ERROR)
        with open(trt_engine_file_path, "rb") as f, trt.Runtime(trt_logger) as runtime:
            # Keeping this as a class variable because we want to be able to
            # allocate the output tensors either on its first use or when the
            # batch size changes.
            self.trt_model = runtime.deserialize_cuda_engine(f.read())

        # Create the execution context.
        self.model = self.trt_model.create_execution_context()

        # We will allocate the output tensors and their bindings either when we
        # use them for the first time or when the batch size changes.
        self.output_tensors, self.output_idx = None, None

        self.logger.info("Using TensorRT as the inference engine.")
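
The constructor defers the allocation of the output tensors to the setup_tensort_bindings helper from the sample's utilities, which is invoked from __call__ further below. To illustrate the idea, the snippet below is a minimal sketch of what such a helper could look like, assuming an implicit-batch engine and PyTorch GPU tensors for the outputs (so that data_ptr() can later be used as a device binding). It is illustrative only and not the sample's actual implementation.

import numpy as np
import tensorrt as trt
import torch


def setup_tensort_bindings(trt_model, batch_size, device_id, logger):
    # Illustrative sketch, not the sample's actual implementation.
    # Walk over the engine bindings and allocate one GPU tensor per output
    # binding. With an implicit-batch engine the binding shape does not
    # include the batch dimension, so we prepend it ourselves.
    output_tensors, output_idx = [], None

    for idx in range(trt_model.num_bindings):
        if trt_model.binding_is_input(idx):
            continue
        if output_idx is None:
            output_idx = idx  # remember the index of the first output binding

        shape = (batch_size,) + tuple(trt_model.get_binding_shape(idx))
        np_dtype = np.dtype(trt.nptype(trt_model.get_binding_dtype(idx)))

        logger.info("Allocating output binding %d with shape %s" % (idx, shape))
        output_tensors.append(
            torch.zeros(
                shape,
                dtype=getattr(torch, np_dtype.name),
                device="cuda:%d" % device_id,
            )
        )

    return output_tensors, output_idx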

To run inference, the __call__ method is used. It sets up the correct I/O bindings and performs the forward inference pass on the current CUDA stream. The input data is passed directly from the CVCUDA tensor, without any further conversions, by accessing its __cuda_array_interface__ member, as shown in the code below.

def __call__(self, tensor):
    self.cvcuda_perf.push_range("inference.tensorrt")

    # Grab the data directly from the pre-allocated CVCUDA tensor.
    input_bindings = [tensor.cuda().__cuda_array_interface__["data"][0]]
    output_bindings = []

    actual_batch_size = tensor.shape[0]

    # Allocate the output tensors on first use or when the batch size changes.
    if not self.output_tensors or actual_batch_size != self.batch_size:
        self.output_tensors, self.output_idx = setup_tensort_bindings(
            self.trt_model,
            actual_batch_size,
            self.device_id,
            self.logger,
        )

    for t in self.output_tensors:
        output_bindings.append(t.data_ptr())
    io_bindings = input_bindings + output_bindings

    # Call inference for the implicit-batch engine on the current CUDA stream.
    self.model.execute_async(
        actual_batch_size,
        bindings=io_bindings,
        stream_handle=cvcuda.Stream.current.handle,
    )

    boxes = self.output_tensors[0]
    score = self.output_tensors[1]

    self.cvcuda_perf.pop_range()  # inference.tensorrt
    return boxes, score
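
To put the two pieces together, a hypothetical usage of this class could look like the following. The no-op perf helper, the input tensor construction, and the chosen sizes are assumptions for illustration; in the sample they come from the surrounding pipeline (its perf utility and the preprocessing stage).

import cvcuda
import torch


class NoOpPerf:
    # Minimal stand-in for the sample's perf utility (assumption): this
    # sketch only needs push_range()/pop_range() to be callable.
    def push_range(self, *args, **kwargs):
        pass

    def pop_range(self, *args, **kwargs):
        pass


detector = ObjectDetectionTensorRT(
    output_dir="/tmp/models",
    batch_size=4,
    image_size=(960, 544),  # (width, height) expected by PeopleNet
    device_id=0,
    cvcuda_perf=NoOpPerf(),
)

# NCHW float input as produced by the preprocessing stage, wrapped as a
# CVCUDA tensor without copying the underlying GPU buffer.
torch_input = torch.rand(4, 3, 544, 960, dtype=torch.float32, device="cuda:0")
cvcuda_input = cvcuda.as_tensor(torch_input, "NCHW")

boxes, score = detector(cvcuda_input)
print(boxes.shape, score.shape)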