Inference using TensorRT Backend.

发表于 2020-09-21 分类于算法与硬件阅读次数：

0x01 Tensorflow 2.0

1.1. Convert

Keras HDF5 -> SavedModel (.pb)

# V2 behaviour is disabled by default in Jetpack 4.4.DP.
import tensorflow.compat.v2 as tf
from tensorflow.keras.models import load_model

# Re-export the trained Keras HDF5 checkpoint with save_format='tf'.
KERAS_CHECKPOINT = './model/fer2013_mini_XCEPTION.102-0.66.hdf5'
SAVED_MODEL_DIR = './model/tf_savedmodel'

model = load_model(KERAS_CHECKPOINT)
model.save(SAVED_MODEL_DIR, save_format='tf')

.pb -> TensorRT-optimized .pb (TF-TRT)

# `trt` here is presumably TF-TRT's converter module
# (tensorflow.python.compiler.tensorrt.trt_convert) -- confirm the import.
#
# DEFAULT_TRT_CONVERSION_PARAMS is a namedtuple: _replace() returns a modified
# copy and leaves the original untouched, so the result must be kept. Calling
# it without assigning silently converted with the default precision.
params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.INT8)
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir='./model/tf_savedmodel', conversion_params=params)
# NOTE(review): with INT8 precision and calibration enabled, TF-TRT's
# documented workflow passes `calibration_input_fn=<generator of representative
# input batches>` to convert(); supply one here -- confirm against the
# installed TensorFlow version.
converter.convert()
converter.save('./model/trt_int8')

1.2. Inference

if use_trt:
    # Local import: only the TF-TRT path needs the freezing helper.
    from tensorflow.python.framework import convert_to_constants

    # Load the converted SavedModel and pick its default serving signature.
    saved_model_loaded = tf.saved_model.load('./model/trt_int8', tags=[trt.tag_constants.SERVING])
    graph_func = saved_model_loaded.signatures[trt.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    # `frozen_func` was used without ever being defined. Freeze the signature
    # function so the classifier does not depend on `saved_model_loaded`
    # staying referenced after this scope ends.
    frozen_func = convert_to_constants.convert_variables_to_constants_v2(graph_func)
    self.emotion_classifier = frozen_func
else:
    # Plain Keras path; compile=False because the model is only used for inference here.
    self.emotion_classifier = tf.keras.models.load_model('xxx.hdf5', compile=False)

1.3. v2 Behaviour

It looks like the current TensorFlow build for JetPack 4.4 was compiled with the --config=v1 flag, because V2 behaviour appears to be disabled by default.
The workaround is to import the `compat.v2` API explicitly and call `tf.enable_v2_behavior()`:

# Import the v2 API surface explicitly, then turn V2 behaviour on, since this
# TensorFlow build reportedly starts with it disabled.
import tensorflow.compat.v2 as tf
import tensorflow.compat.v2.keras as keras
tf.enable_v2_behavior()

0x02 Others ( Pytorch / MxNet / Caffe )

2.1. Convert model to ONNX

# Placeholder: the ONNX export step is not covered in this note.
pass

2.2. Build TensorRT Engine from ONNX Model

def build_engine_onnx(model_file):
    """Build a TensorRT engine from an ONNX model file.

    Relies on module-level names that this snippet does not define itself:
    ``TRT_LOGGER``, ``common`` (for ``GiB``) and ``batch_size``.

    Args:
        model_file: Path to the ``.onnx`` file to parse.

    Returns:
        The result of ``builder.build_cuda_engine(network)`` for the parsed
        network.

    Raises:
        RuntimeError: If the ONNX parser reports that parsing failed.
    """
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    # Backslash continuations are required: a multi-item ``with`` header cannot
    # simply be broken across lines (the parenthesised form needs Python 3.10+).
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(explicit_batch) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = common.GiB(1)
        builder.max_batch_size = batch_size
        # Load the ONNX model and parse it in order to populate the TensorRT network.
        with open(model_file, 'rb') as model:
            parsed_ok = parser.parse(model.read())
        if not parsed_ok:
            # Surface the parser's own diagnostics instead of building an
            # engine from a partially populated network.
            errors = [str(parser.get_error(i)) for i in range(parser.num_errors)]
            raise RuntimeError(
                "Failed to parse ONNX model {!r}: {}".format(model_file, "; ".join(errors)))
        return builder.build_cuda_engine(network)

# Build the engine from the ONNX file and store its serialized form on disk.
engine_file_path = './arcface_trt.engine'
engine = build_engine_onnx('resnet100.onnx')
with open(engine_file_path, "wb") as engine_out:
    engine_out.write(engine.serialize())

2.3. Inference from TRT Engine

def build(engine_file):
    """Deserialize a TensorRT engine and prepare what is needed to run it.

    Relies on the module-level ``TRT_LOGGER`` and on ``allocate_buffers``.

    Args:
        engine_file: Path to a serialized engine file.

    Returns:
        Tuple ``(engine, context, inputs, outputs, bindings, stream)``.
        ``run`` reads ``inputs``, ``outputs`` and ``stream`` as bare names,
        so callers are expected to bind the returned values accordingly.
    """
    with open(engine_file, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    context = engine.create_execution_context()
    # Previously nothing was returned, so every object created here was
    # dropped when the function exited. The engine is handed back as well so
    # the caller can keep it referenced alongside its context.
    return engine, context, inputs, outputs, bindings, stream

def run(objects_frame):
    """Copy a frame into the first input buffer and run one inference pass.

    Reads the module-level names ``inputs``, ``outputs``, ``bindings``,
    ``stream`` and ``context`` (the objects ``build`` creates), and calls
    ``do_inference``.

    Args:
        objects_frame: Array-like with ``shape``, ``flatten`` and ``astype``
            (presumably a NumPy array -- confirm against the caller).

    Returns:
        The list returned by ``do_inference``.
    """
    allocate_place = np.prod(objects_frame.shape)
    # Flatten in C order and cast to float32 before writing into the host buffer.
    inputs[0].host[:allocate_place] = objects_frame.flatten(order='C').astype(np.float32)
    # This function has no ``self`` parameter, so ``self.context`` and
    # ``self.bindings`` could never resolve; use the same bare names as the
    # other buffers.
    trt_outputs = do_inference(
        context, bindings=bindings,
        inputs=inputs, outputs=outputs, stream=stream)

    return trt_outputs

2.4. pycuda

# Compatibility shim: when the interpreter has no built-in FileNotFoundError,
# the bare name lookup below raises NameError and the name is aliased to IOError.
try:
    # Python 2 does not define FileNotFoundError.
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError

def GiB(val):
    """Return ``val`` gibibytes expressed in bytes.

    Args:
        val: Number of GiB; ints and floats are both accepted.

    Returns:
        ``val * 2**30`` (1 GiB = 1024 * 1024 * 1024 bytes).
    """
    # Parenthesised on purpose: ``val * 1 << 30`` parses as ``(val * 1) << 30``,
    # which gives the same result for ints but raises TypeError for floats.
    return val * (1 << 30)

# Small container pairing a host buffer with its device counterpart; reads
# better at call sites than a bare 2-tuple.
class HostDeviceMem(object):
    """Holds two buffers as the attributes ``host`` and ``device``."""

    def __init__(self, host_mem, device_mem):
        self.device = device_mem
        self.host = host_mem

    def __repr__(self):
        # The repr is deliberately the same text as the str form.
        return self.__str__()

    def __str__(self):
        pieces = ("Host:", str(self.host), "Device:", str(self.device))
        return "\n".join(pieces)

# Creates the host/device buffer pairs for every binding of an engine.
def allocate_buffers(engine):
    """Allocate one host and one device buffer per engine binding.

    Args:
        engine: Object that iterates over its bindings and provides
            ``get_binding_shape``, ``get_binding_dtype``, ``binding_is_input``
            and ``max_batch_size``.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``: two lists of
        ``HostDeviceMem``, the list of device buffer addresses as ints (in
        binding order), and a new ``cuda.Stream``.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for name in engine:
        # Element count for this binding, scaled by the engine's max batch size.
        n_elems = trt.volume(engine.get_binding_shape(name)) * engine.max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(name))
        # Page-locked host buffer and a device buffer of the same byte size.
        host_buf = cuda.pagelocked_empty(n_elems, np_dtype)
        device_buf = cuda.mem_alloc(host_buf.nbytes)
        # The bindings list carries the device buffer as an int.
        bindings.append(int(device_buf))
        # File the pair under inputs or outputs depending on the binding kind.
        target = inputs if engine.binding_is_input(name) else outputs
        target.append(HostDeviceMem(host_buf, device_buf))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one asynchronous inference pass and return the host output buffers.

    Args:
        context: Execution context providing ``execute_async``.
        bindings: Device buffer addresses passed through to ``execute_async``.
        inputs: ``HostDeviceMem`` objects copied host-to-device before running.
        outputs: ``HostDeviceMem`` objects copied device-to-host afterwards.
        stream: Stream used for the copies and the execution; it is
            synchronized before returning.
        batch_size: Batch size forwarded to ``execute_async``.

    Returns:
        List with the ``host`` buffer of each entry in ``outputs``.
    """
    # Transfer input data to the GPU. Plain loops: the calls are made for
    # their side effects, so there is no reason to build throwaway lists.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait for the queued copies and execution to finish.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]