# Source code for trtutils.builder._dla

# Copyright (c) 2024-2026 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
# mypy: disable-error-code="import-untyped"
from __future__ import annotations

from typing import TYPE_CHECKING

from trtutils._log import LOG
from trtutils.compat._libs import trt

from ._build import build_engine
from ._onnx import read_onnx
from ._utils import get_check_dla

if TYPE_CHECKING:
    from collections.abc import Callable
    from pathlib import Path

    from ._batcher import AbstractBatcher


def can_run_on_dla(
    onnx: Path | str | trt.INetworkDefinition,
    config: trt.IBuilderConfig | None = None,
    *,
    verbose_layers: bool | None = None,
    verbose_chunks: bool | None = None,
) -> tuple[bool, list[tuple[list[trt.ILayer], int, int, bool]]]:
    """
    Check whether every layer of a model can be placed on a DLA.

    The layers are walked in index order and grouped into contiguous
    chunks that share the same DLA compatibility result.

    Note that the config (the one passed in, or the one produced by
    reading the onnx file) is modified: its default device type is set
    to DLA and its DLA core is set to 0.

    Parameters
    ----------
    onnx : Path, str, or trt.INetworkDefinition
        The path to the onnx file or a pre-made TensorRT network.
    config : trt.IBuilderConfig, optional
        The TensorRT builder config. Required if onnx is a network.
    verbose_layers : bool, optional
        Whether to log the details and DLA result of every layer,
        by default None
    verbose_chunks : bool, optional
        Whether to log a summary of every chunk of layers,
        by default None

    Returns
    -------
    tuple[bool, list[tuple[list[trt.ILayer], int, int, bool]]]
        A flag which is True only if no layer failed the DLA check,
        and the list of chunks. Each chunk is a tuple of
        (layers, first layer index, last layer index, on DLA),
        where the last index is inclusive.

    Raises
    ------
    ValueError
        If config is not provided when onnx is a network

    """
    # resolve the (network, config) pair from whichever input was given
    if not isinstance(onnx, trt.INetworkDefinition):
        network, _, config, _ = read_onnx(onnx)
    elif config is None:
        err_msg = "Config must be provided when onnx is a network"
        raise ValueError(err_msg)
    else:
        network = onnx

    check_dla: Callable[[trt.ILayer], bool] = get_check_dla(config)

    # the specific core is irrelevant for a compatibility check, so use core 0
    config.default_device_type = trt.DeviceType.DLA
    config.DLA_core = 0

    chunks: list[tuple[list[trt.ILayer], int, int, bool]] = []
    pending: list[trt.ILayer] = []
    pending_start: int = 0
    prev_on_dla = False
    everything_on_dla = True

    for idx in range(network.num_layers):
        layer = network.get_layer(idx)

        dla_valid = check_dla(layer)
        if not dla_valid:
            everything_on_dla = False

        # a change in compatibility closes the open chunk and starts a new one
        if len(pending) > 0 and dla_valid != prev_on_dla:
            chunks.append((pending, pending_start, idx - 1, prev_on_dla))
            pending = [layer]
            pending_start = idx
        else:
            pending.append(layer)
        prev_on_dla = dla_valid

        if verbose_layers:
            LOG.info(
                f"Layer {idx}: {layer.name}, {layer.type}, {layer.precision}, {layer.metadata}",
            )
            LOG.info(f"\tDLA: {dla_valid}")

    # whatever is still open after the loop becomes the last chunk
    chunks.append((pending, pending_start, network.num_layers - 1, prev_on_dla))

    if verbose_chunks:
        LOG.info(f"Found {len(chunks)} Chunks of Layers")
        for i, (layers, start, end, on_dla) in enumerate(chunks):
            LOG.info(
                f"\tChunk {i}: [{start} - {end}], {len(layers)} layers, {'DLA' if on_dla else 'GPU'}"
            )

    return everything_on_dla, chunks
def build_dla_engine(
    onnx: Path | str,
    output_path: Path | str,
    data_batcher: AbstractBatcher,
    dla_core: int,
    max_chunks: int = 1,
    min_layers: int = 20,
    workspace: float = 4.0,
    calibration_cache: Path | str | None = None,
    timing_cache: Path | str | None = None,
    shapes: list[tuple[str, tuple[int, ...]]] | None = None,
    input_tensor_formats: list[tuple[str, trt.DataType, trt.TensorFormat]] | None = None,
    output_tensor_formats: list[tuple[str, trt.DataType, trt.TensorFormat]] | None = None,
    hooks: list[Callable[[trt.INetworkDefinition], trt.INetworkDefinition]] | None = None,
    optimization_level: int = 3,
    *,
    direct_io: bool = False,
    prefer_precision_constraints: bool = False,
    reject_empty_algorithms: bool = False,
    ignore_timing_mismatch: bool = False,
    fp8: bool | None = None,
    cache: bool | None = None,
    verbose: bool | None = None,
) -> None:
    """
    Automatically build a TensorRT engine for DLA with automatic layer assignments.

    This function will:
    1. Check which layers can run on DLA
    2. Sort the DLA-compatible chunks of layers from largest to smallest
    3. Assign up to max_chunks of those chunks (each with at least
       min_layers layers) to DLA with INT8 precision
    4. Assign remaining layers to GPU with FP16 precision, except
       Constant, Shuffle, and Tile layers which get no explicit precision

    If every layer can run on DLA, the whole model is built for DLA.
    If no layer can run on DLA, a GPU-only engine is built instead.

    Parameters
    ----------
    onnx : Path, str
        The path to the ONNX model
    output_path : Path, str
        The path where the engine should be saved
    data_batcher : AbstractBatcher
        The data batcher instance for INT8 calibration
    dla_core : int
        The DLA core to use
    max_chunks : int, optional
        The maximum number of DLA-compatible chunks to assign to the DLA.
        Chunks are considered from largest to smallest.
        By default 1, which will assign only the largest compatible chunk.
        Can set to 0 to assign all chunks which meet min_layers.
    min_layers : int, optional
        The minimum number of layers in a chunk to be assigned to DLA.
        By default 20, which will assign chunks with at least 20 layers.
        Can set to 0 to assign all chunks.
    workspace : float
        The size of the workspace in gigabytes.
        Default is 4.0 GiB.
    calibration_cache : Path, str, optional
        The path to the calibration cache.
        Default is None.
    timing_cache : Path, str, optional
        Where to store the timing cache data.
        Default is None.
    shapes : list[tuple[str, tuple[int, ...]]], optional
        A list of (input_name, shape) pairs to specify the shapes of the input layers.
        For example, shapes=[("images", (1, 3, imgsz, imgsz))] will set the input "images"
        to a fixed shape. This shape will be used as the min, optimal, and max shape
        for the binding.
        By default, None.
    input_tensor_formats : list[tuple[str, trt.DataType, trt.TensorFormat]], optional
        A list of (name, dtype format) to allow deep specification of input layers.
        For example, input_tensor_formats=[("input", trt.DataType.UINT8, trt.TensorFormat.HWC)]
        By default, None
    output_tensor_formats : list[tuple[str, trt.DataType, trt.TensorFormat]], optional
        A list of (name, dtype format) to allow deep specification of output layers.
        For example, output_tensor_formats=[("output", trt.DataType.HALF, trt.TensorFormat.LINEAR)]
        By default, None
    hooks : list[Callable[[trt.INetworkDefinition], trt.INetworkDefinition]], optional
        An optional list of 'hook' functions to modify the TensorRT network before
        the remainder of the build phase occurs.
        By default, None
    optimization_level : int, optional
        Optimization level to apply to the TensorRT builder config (0-5).
        Forwarded to the engine build in every case (full DLA, GPU-only, mixed).
        By default, 3.
    direct_io : bool
        Use direct IO for the engine.
        By default, False
    prefer_precision_constraints : bool
        Whether or not to prefer precision constraints.
        By default, False
    reject_empty_algorithms : bool
        Whether or not to reject empty algorithms.
        By default, False
    ignore_timing_mismatch : bool
        Whether or not to allow different CUDA device generated timing caches
        to be used in the building of engines.
        By default, False
    fp8 : bool, optional
        If True, enable FP8 precision for GPU layers.
        Requires compute capability >= 8.9 (Ada Lovelace / Hopper or newer).
        DLA layers will still use INT8 precision.
    cache : bool, optional
        Whether or not to cache the engine in the trtutils engine cache.
        If an existing version is found will use that.
        Uses the name of the output file to assess if the engine has been compiled before.
        As such, naming the output 'engine', 'model' or similar will result in
        unintended caching behavior.
        By default None, will not cache the engine.
    verbose : bool, optional
        Whether to print verbose output, by default None

    """
    # read the onnx path
    network, _, config, _ = read_onnx(onnx)

    # check layers for DLA compatibility and use int8 precision
    full_dla, chunks = can_run_on_dla(
        onnx=network,
        config=config,
        verbose_layers=verbose,
        verbose_chunks=verbose,
    )
    if verbose:
        LOG.info(f"Model can run fully on DLA: {full_dla}")
        LOG.info(f"Found {len(chunks)} chunks of layers")

    # case where the entire model can run on DLA
    if full_dla:
        build_engine(
            onnx,
            output_path,
            default_device=trt.DeviceType.DLA,
            data_batcher=data_batcher,
            workspace=workspace,
            timing_cache=timing_cache,
            calibration_cache=calibration_cache,
            dla_core=dla_core,
            shapes=shapes,
            input_tensor_formats=input_tensor_formats,
            output_tensor_formats=output_tensor_formats,
            hooks=hooks,
            optimization_level=optimization_level,
            direct_io=direct_io,
            prefer_precision_constraints=prefer_precision_constraints,
            reject_empty_algorithms=reject_empty_algorithms,
            ignore_timing_mismatch=ignore_timing_mismatch,
            cache=cache,
            fp16=True,
            fp8=fp8,
            int8=True,
            verbose=verbose,
        )
        return

    # keep only the chunks flagged as DLA-compatible, with their original index
    dla_chunks = [(i, chunk) for i, chunk in enumerate(chunks) if chunk[3]]

    # case where no DLA layers are found
    if not dla_chunks:
        LOG.warning("No DLA-compatible layers found. Building GPU-only engine.")
        build_engine(
            onnx,
            output_path,
            workspace=workspace,
            timing_cache=timing_cache,
            calibration_cache=calibration_cache,
            data_batcher=data_batcher,
            shapes=shapes,
            input_tensor_formats=input_tensor_formats,
            output_tensor_formats=output_tensor_formats,
            hooks=hooks,
            optimization_level=optimization_level,
            direct_io=direct_io,
            prefer_precision_constraints=prefer_precision_constraints,
            reject_empty_algorithms=reject_empty_algorithms,
            ignore_timing_mismatch=ignore_timing_mismatch,
            fp16=True,
            fp8=fp8,
            int8=True,
            cache=cache,
            verbose=verbose,
        )
        return

    # sort DLA chunks largest first so the biggest chunks are assigned first
    dla_chunks = sorted(dla_chunks, key=lambda x: len(x[1][0]), reverse=True)
    if verbose:
        LOG.info(
            f"Found {len(chunks)} total chunks of which: {len(dla_chunks)} are DLA compatible."
        )

    # define lists for storing layer assignments
    layer_precision: list[tuple[int, trt.DataType | None]] = []
    layer_device: list[tuple[int, trt.DeviceType | None]] = []

    # NOTE(review): the layer indices below come from the network parsed here,
    # before any hooks run. Presumably build_engine re-parses the onnx and
    # applies hooks; if a hook adds or removes layers the indices may no longer
    # line up - confirm against build_engine.

    # assign default to GPU/FP16
    exclude_layer_types = [trt.LayerType.CONSTANT, trt.LayerType.SHUFFLE]
    for idx in range(network.num_layers):
        layer = network.get_layer(idx)
        layer_name: str = layer.name
        layer_name = layer_name.lower()

        layer_device.append((idx, trt.DeviceType.GPU))

        # intelligently assign precision level to HALF unless layer
        # is Constant, Shuffle, or Tile
        if layer.type in exclude_layer_types or "tile" in layer_name:
            layer_precision.append((idx, None))
        else:
            layer_precision.append((idx, trt.DataType.HALF))

    # iterate over chunks (largest first) and assign to DLA
    matched_chunks = 0
    for _, (layers, start, end, _on_dla) in dla_chunks:
        # max_chunks <= 0 means no limit on the number of chunks
        if matched_chunks >= max_chunks and max_chunks > 0:
            break
        # chunks are sorted by descending size, so every later chunk is too small as well
        if len(layers) < min_layers:
            break
        for layer_id in range(start, end + 1):
            layer_precision[layer_id] = (layer_id, trt.DataType.INT8)
            layer_device[layer_id] = (layer_id, trt.DeviceType.DLA)
        matched_chunks += 1

    # verbose iteration
    if verbose:
        for (idx, device), (_, datatype) in zip(layer_device, layer_precision):
            # layers left with no explicit precision are not FP16, report them as such
            if datatype == trt.DataType.INT8:
                precision_label = "INT8"
            elif datatype == trt.DataType.HALF:
                precision_label = "FP16"
            else:
                precision_label = "default"
            LOG.info(
                f"Layer {idx}: {network.get_layer(idx).name}, "
                f"{'DLA' if device == trt.DeviceType.DLA else 'GPU'}, "
                f"{precision_label}"
            )

    # build engine with specific layer assignments
    build_engine(
        onnx,
        output_path,
        default_device=trt.DeviceType.DLA,  # default device DLA
        timing_cache=timing_cache,
        workspace=workspace,
        calibration_cache=calibration_cache,
        data_batcher=data_batcher,
        layer_precision=layer_precision,
        layer_device=layer_device,
        dla_core=dla_core,  # ensure DLA core is maintained
        shapes=shapes,
        input_tensor_formats=input_tensor_formats,
        output_tensor_formats=output_tensor_formats,
        hooks=hooks,
        optimization_level=optimization_level,
        gpu_fallback=True,  # enable GPU fallback to account for input/copy
        direct_io=direct_io,
        prefer_precision_constraints=prefer_precision_constraints,
        reject_empty_algorithms=reject_empty_algorithms,
        ignore_timing_mismatch=ignore_timing_mismatch,
        fp16=True,
        fp8=fp8,
        int8=True,
        cache=cache,
        verbose=verbose,
    )