# Source code for trtutils.builder._build

# Copyright (c) 2024 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
# mypy: disable-error-code="import-untyped"
from __future__ import annotations

import contextlib
import shutil
from pathlib import Path
from typing import TYPE_CHECKING

with contextlib.suppress(ImportError):
    import tensorrt as trt

from trtutils._config import CONFIG
from trtutils._flags import FLAGS
from trtutils._log import LOG
from trtutils.core import cache as caching_tools

from ._calibrator import EngineCalibrator
from ._onnx import read_onnx
from ._utils import get_check_dla

if FLAGS.BUILD_PROGRESS:
    from ._progress import ProgressBar

if TYPE_CHECKING:
    from collections.abc import Callable

    from ._batcher import AbstractBatcher


def _resolve_device_type(default_device: trt.DeviceType | str) -> trt.DeviceType:
    """Validate ``default_device`` and convert it to a ``trt.DeviceType``."""
    valid_gpu = ["gpu", "GPU"]
    valid_dla = ["dla", "DLA"]
    if isinstance(default_device, str):
        if default_device not in valid_gpu + valid_dla:
            err_msg = f"Invalid default device: {default_device}. Must be one of: {valid_gpu + valid_dla}"
            raise ValueError(err_msg)
        return (
            trt.DeviceType.GPU if default_device in valid_gpu else trt.DeviceType.DLA
        )
    if default_device not in [trt.DeviceType.GPU, trt.DeviceType.DLA]:
        err_msg = f"Invalid default device: {default_device}. Must be one of: {valid_gpu + valid_dla}"
        raise ValueError(err_msg)
    return (
        trt.DeviceType.GPU
        if default_device == trt.DeviceType.GPU
        else trt.DeviceType.DLA
    )


def _apply_tensor_formats(
    kind: str,
    num_tensors: int,
    get_tensor: Callable[[int], trt.ITensor],
    tensor_formats: list[tuple[str, trt.DataType, trt.TensorFormat]],
) -> None:
    """Set dtype/allowed format on named network I/O tensors, warning on unknown names."""
    for tensor_name, tensor_dtype, tensor_format in tensor_formats:
        for idx in range(num_tensors):
            tensor = get_tensor(idx)
            if tensor.name == tensor_name:
                tensor.dtype = tensor_dtype
                # allowed_formats is a bitmask indexed by the TensorFormat value
                tensor.allowed_formats = 1 << int(tensor_format)
                break
        else:
            # loop finished without a match
            LOG.warning(f"{kind} tensor '{tensor_name}' not found in network")


def build_engine(
    onnx: Path | str,
    output: Path | str,
    default_device: trt.DeviceType | str = trt.DeviceType.GPU,
    timing_cache: Path | str | None = None,
    workspace: float = 4.0,
    dla_core: int | None = None,
    calibration_cache: Path | str | None = None,
    data_batcher: AbstractBatcher | None = None,
    layer_precision: list[tuple[int, trt.DataType | None]] | None = None,
    layer_device: list[tuple[int, trt.DeviceType | None]] | None = None,
    shapes: list[tuple[str, tuple[int, ...]]] | None = None,
    input_tensor_formats: list[tuple[str, trt.DataType, trt.TensorFormat]]
    | None = None,
    output_tensor_formats: list[tuple[str, trt.DataType, trt.TensorFormat]]
    | None = None,
    hooks: list[Callable[[trt.INetworkDefinition], trt.INetworkDefinition]]
    | None = None,
    *,
    gpu_fallback: bool = False,
    direct_io: bool = False,
    prefer_precision_constraints: bool = False,
    reject_empty_algorithms: bool = False,
    ignore_timing_mismatch: bool = False,
    fp16: bool | None = None,
    int8: bool | None = None,
    cache: bool | None = None,
    verbose: bool | None = None,
) -> None:
    """
    Build a TensorRT engine from an ONNX model.

    The order in which operations occur inside build_engine:

    1. Parse the ONNX model
    2. Apply any network hooks
    3. Create optimization profile and apply any manual shapes
    4. Apply builder flags (precision constraints, empty algorithms, direct I/O)
    5. Configure tensor formats if specified
    6. Configure precision (FP16, INT8)
    7. Set default device and DLA core
    8. Apply individual layer precision and device settings
    9. Set up timing cache
    10. Build the engine
    11. Save timing cache and engine

    Parameters
    ----------
    onnx : Path, str
        The path to the onnx model.
    output : Path, str
        The location to save the TensorRT engine.
    default_device : trt.DeviceType, str, optional
        The device to use for the engine.
        By default, trt.DeviceType.GPU.
        Options are trt.DeviceType.GPU, trt.DeviceType.DLA,
        or a string of "gpu", "GPU", "dla" or "DLA".
    timing_cache : Path, str, optional
        Where to store the timing cache data.
        An existing file at this path is loaded before the build and the
        file is (re)written after the build.
        Default is None.
    workspace : float
        The size of the workspace in gigabytes.
        Default is 4.0 GiB.
    dla_core : int, optional
        The DLA core to build the engine for.
        By default, None or build the engine for GPU.
    calibration_cache : Path, str, optional
        The path to the calibration cache.
    data_batcher : AbstractBatcher, optional
        The data batcher to use for calibration.
    layer_precision : list[tuple[int, trt.DataType | None]], optional
        The precision to use for specific layers, as (layer_index, precision)
        pairs. The list must contain one entry per network layer; entries
        with a precision of None are skipped.
        By default, None.
    layer_device : list[tuple[int, trt.DeviceType | None]], optional
        The device to use for specific layers, as (layer_index, device)
        pairs. The list must contain one entry per network layer; entries
        with a device of None are skipped.
        By default, None.
    shapes : list[tuple[str, tuple[int, ...]]], optional
        A list of (input_name, shape) pairs to specify the shapes of the input layers.
        For example, shapes=[("images", (1, 3, imgsz, imgsz))] will set the
        input "images" to a fixed shape.
        This shape will be used as the min, optimal, and max shape for the binding.
        By default, None.
    input_tensor_formats : list[tuple[str, trt.DataType, trt.TensorFormat]], optional
        A list of (name, dtype, format) to allow deep specification of input layers.
        For example,
        input_tensor_formats=[("input", trt.DataType.UINT8, trt.TensorFormat.HWC)]
        Passing this enables direct IO.
        By default, None
    output_tensor_formats : list[tuple[str, trt.DataType, trt.TensorFormat]], optional
        A list of (name, dtype, format) to allow deep specification of output layers.
        For example,
        output_tensor_formats=[("output", trt.DataType.HALF, trt.TensorFormat.LINEAR)]
        Passing this enables direct IO.
        By default, None
    hooks : list[Callable[[trt.INetworkDefinition], trt.INetworkDefinition]], optional
        An optional list of 'hook' functions to modify the TensorRT network before
        the remainder of the build phase occurs. Each hook receives the network
        and its return value replaces the network.
        By default, None
    gpu_fallback : bool
        Whether or not to allow GPU fallback for unsupported layers
        when building the engine for DLA.
        By default, False
    direct_io : bool
        Use direct IO for the engine.
        By default, False
    prefer_precision_constraints : bool
        Whether or not to prefer precision constraints.
        By default, False
    reject_empty_algorithms : bool
        Whether or not to reject empty algorithms.
        By default, False
    ignore_timing_mismatch : bool
        Whether or not to allow different CUDA device generated timing caches
        to be used in the building of engines.
        By default, False
    fp16 : bool, optional
        If True, quantize the engine to FP16 precision.
    int8 : bool, optional
        If True, quantize the engine to INT8 precision.
        The FP16 flag is enabled as well.
    cache : bool, optional
        Whether or not to cache the engine in the trtutils engine cache.
        If an existing version is found will use that.
        Uses the name of the output file to assess if the engine has been
        compiled before. As such, naming the output 'engine', 'model' or
        similar will result in unintended caching behavior.
        By default None, will not cache the engine.
    verbose : bool, optional
        If True, print verbose output. Within this function it attaches a
        ProgressBar to the builder config when FLAGS.BUILD_PROGRESS is set.
        By default, None or False

    Raises
    ------
    RuntimeError
        If the ONNX model cannot be parsed
    RuntimeError
        If the TensorRT engines fails to build
    ValueError
        If default_device is not a recognised device
    ValueError
        If layer_precision or layer_device does not have one entry per layer
    ValueError
        If layer is manually assigned to DLA and DLA is not supported
        and gpu_fallback is False

    """
    # load libnvinfer plugins
    CONFIG.load_plugins()

    output_path = Path(output).resolve()

    # first thing is to check cache
    if cache:
        exists, location = caching_tools.query_cache(output_path.stem)
        if exists:
            shutil.copy(location, output_path)
            return

    # match the device
    default_device = _resolve_device_type(default_device)

    # read the onnx model
    network, builder, config, _ = read_onnx(
        onnx,
        workspace,
    )

    # handle all hooks to start
    if hooks is not None:
        for hook in hooks:
            network = hook(network)

    # helper function for checking if layer can run on DLA
    check_dla: Callable[[trt.ILayer], bool] = get_check_dla(config)

    if verbose and FLAGS.BUILD_PROGRESS:
        LOG.debug("Applying ProgressBar to config")
        config.progress_monitor = ProgressBar()

    # create profile and config
    profile = builder.create_optimization_profile()
    # handle if manual shapes were passed for inputs
    if shapes:
        for input_name, shape in shapes:
            # set the minimum, optimal, maximum to all the same
            profile.set_shape(input_name, shape, shape, shape)
    config.add_optimization_profile(profile)

    # handle some flags
    if prefer_precision_constraints:
        config.set_flag(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
    if reject_empty_algorithms:
        config.set_flag(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)

    # handle custom datatype/format for input/output tensors
    if (
        input_tensor_formats is not None or output_tensor_formats is not None
    ) and not direct_io:
        LOG.warning(
            "Direct IO not enabled, but some tensor formats specified. Enabling direct IO."
        )
        direct_io = True
    if direct_io:
        config.set_flag(trt.BuilderFlag.DIRECT_IO)
    if input_tensor_formats is not None:
        _apply_tensor_formats(
            "Input",
            network.num_inputs,
            network.get_input,
            input_tensor_formats,
        )
    if output_tensor_formats is not None:
        _apply_tensor_formats(
            "Output",
            network.num_outputs,
            network.get_output,
            output_tensor_formats,
        )

    # setup the precision sets
    if fp16 or int8:
        # want to enable fp16 for both int8 and fp16 since fp16 may be faster
        if not builder.platform_has_fast_fp16:
            LOG.warning("Platform does not have native fast FP16.")
        config.set_flag(trt.BuilderFlag.FP16)
    if int8:
        if not builder.platform_has_fast_int8:
            LOG.warning("Platform does not have native fast INT8.")
        config.set_flag(trt.BuilderFlag.INT8)
        if calibration_cache is None and data_batcher is None:
            err_msg = "Neither calibration cache or data batcher passed during model building, INT8 build will not be accurate."
            LOG.warning(err_msg)
        config.int8_calibrator = EngineCalibrator(calibration_cache=calibration_cache)
        if data_batcher is not None:
            config.int8_calibrator.set_batcher(data_batcher)

    # assign the default device
    config.default_device_type = default_device

    # handle DLA assignment
    if dla_core is not None:
        config.DLA_core = dla_core
    # the fallback flag is applied whenever requested so that it also covers
    # DLA placement via default_device or layer_device without a dla_core
    if gpu_fallback:
        config.set_flag(trt.BuilderFlag.GPU_FALLBACK)

    # handle individual layer precision
    if layer_precision is not None:
        # validate length
        if len(layer_precision) != network.num_layers:
            err_msg = "Layer precision list must be the same length as the number of layers in the network."
            raise ValueError(err_msg)

        # handle precision assignment
        for layer_idx, precision in layer_precision:
            if precision is None:
                continue
            layer = network.get_layer(layer_idx)
            layer.precision = precision

    # handle individual layer device
    if layer_device is not None:
        # validate length
        if len(layer_device) != network.num_layers:
            err_msg = "Layer device list must be the same length as the number of layers in the network."
            raise ValueError(err_msg)

        # handle device assignment
        for layer_idx, device in layer_device:
            if device is None:
                continue
            layer = network.get_layer(layer_idx)

            # assess if can run on DLA
            if device == trt.DeviceType.DLA and not check_dla(layer):
                err_msg = f"Layer {layer.name} (type: {layer.type}) cannot run on DLA"
                if gpu_fallback:
                    # layer keeps its default placement; only warn
                    err_msg += ", using GPU fallback"
                    LOG.warning(err_msg)
                else:
                    raise ValueError(err_msg)
            else:
                config.set_device_type(layer, device)

    # load/setup the timing cache
    timing_cache_path: Path | None = (
        Path(timing_cache).resolve() if timing_cache else None
    )
    if timing_cache_path:
        # an empty buffer creates a fresh timing cache
        buffer = b""
        if timing_cache_path.exists():
            with timing_cache_path.open("rb") as timing_cache_file:
                buffer = timing_cache_file.read()
        t_cache = config.create_timing_cache(buffer)
        config.set_timing_cache(t_cache, ignore_mismatch=ignore_timing_mismatch)

    # build the engine
    if FLAGS.BUILD_SERIALIZED:
        engine_bytes = builder.build_serialized_network(network, config)
    else:
        # the legacy builder call yields an engine object rather than a
        # serialized buffer, so serialize it before it is written to disk
        engine = builder.build_engine(network, config)
        engine_bytes = engine.serialize() if engine is not None else None

    # save the timing cache (done even when the build failed)
    if timing_cache_path:
        post_t_cache = config.get_timing_cache()
        with timing_cache_path.open("wb") as f:
            f.write(memoryview(post_t_cache.serialize()))

    if engine_bytes is None:
        err_msg = "Failed to build engine."
        raise RuntimeError(err_msg)

    with output_path.open("wb") as f:
        f.write(engine_bytes)

    if cache:
        caching_tools.store_in_cache(output_path, overwrite=False, clear_old=False)