# Copyright (c) 2024-2026 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
# mypy: disable-error-code="import-untyped"
from __future__ import annotations
from typing import TYPE_CHECKING
from trtutils._log import LOG
from trtutils.compat._libs import trt
from ._build import build_engine
from ._onnx import read_onnx
from ._utils import get_check_dla
if TYPE_CHECKING:
from collections.abc import Callable
from pathlib import Path
from ._batcher import AbstractBatcher
[docs]
def can_run_on_dla(
    onnx: Path | str | trt.INetworkDefinition,
    config: trt.IBuilderConfig | None = None,
    *,
    verbose_layers: bool | None = None,
    verbose_chunks: bool | None = None,
) -> tuple[bool, list[tuple[list[trt.ILayer], int, int, bool]]]:
    """
    Whether or not the entire model can be run on a DLA.

    Walks every layer of the network in index order, checks each one for DLA
    compatibility and groups consecutive layers with the same result into chunks.

    Parameters
    ----------
    onnx : Path, str, or trt.INetworkDefinition
        The path to the onnx file or a pre-made TensorRT network.
    config : trt.IBuilderConfig, optional
        The TensorRT builder config. Required if onnx is a network.
        When onnx is a path, the config produced by reading the ONNX file
        is used instead of this argument.
    verbose_layers : bool, optional
        Whether to print verbose output for individual layers, by default None
    verbose_chunks : bool, optional
        Whether to print verbose output for layer chunks, by default None

    Returns
    -------
    tuple[bool, list[tuple[list[trt.ILayer], int, int, bool]]]
        Whether or not the model will all run on DLA and each block of layers.
        Where each block can run on a single device, DLA or GPU.
        Each block is ``(layers, start_index, end_index, on_dla)`` where both
        indices are inclusive layer indices into the network.
        A network without layers yields ``(True, [])``.

    Raises
    ------
    ValueError
        If config is not provided when onnx is a network

    Notes
    -----
    The config in use is modified in place: its default device type is set to
    DLA and its DLA core is set to 0.

    """
    # handle network input
    if isinstance(onnx, trt.INetworkDefinition):
        if config is None:
            err_msg = "Config must be provided when onnx is a network"
            raise ValueError(err_msg)
        network = onnx
    else:
        network, _, config, _ = read_onnx(onnx)

    check_dla: Callable[[trt.ILayer], bool] = get_check_dla(config)

    # assign to DLA 0, since core doesnt matter for this check
    config.default_device_type = trt.DeviceType.DLA
    config.DLA_core = 0

    num_layers: int = network.num_layers
    full_dla = True
    last_layer_dla = False
    chunks: list[tuple[list[trt.ILayer], int, int, bool]] = []
    curr_start: int = 0
    curr_layers: list[trt.ILayer] = []

    for idx in range(num_layers):
        layer = network.get_layer(idx)

        # check if the layer can run on DLA
        dla_valid = check_dla(layer)
        if not dla_valid:
            full_dla = False

        # handle chunk storage: a change in compatibility closes the running
        # chunk (ending at the previous layer) and opens a new one here
        if dla_valid != last_layer_dla and curr_layers:
            chunks.append((curr_layers, curr_start, idx - 1, last_layer_dla))
            curr_layers = [layer]
            curr_start = idx
        else:
            curr_layers.append(layer)
        last_layer_dla = dla_valid

        if verbose_layers:
            LOG.info(
                f"Layer {idx}: {layer.name}, {layer.type}, {layer.precision}, {layer.metadata}",
            )
            LOG.info(f"\tDLA: {dla_valid}")

    # handle final chunk; skipped for an empty network so that no chunk with
    # zero layers and an end index of -1 is reported
    if curr_layers:
        chunks.append((curr_layers, curr_start, num_layers - 1, last_layer_dla))

    if verbose_chunks:
        LOG.info(f"Found {len(chunks)} Chunks of Layers")
        for i, (layers, start, end, on_dla) in enumerate(chunks):
            LOG.info(
                f"\tChunk {i}: [{start} - {end}], {len(layers)} layers, {'DLA' if on_dla else 'GPU'}"
            )

    return full_dla, chunks
[docs]
def build_dla_engine(
    onnx: Path | str,
    output_path: Path | str,
    data_batcher: AbstractBatcher,
    dla_core: int,
    max_chunks: int = 1,
    min_layers: int = 20,
    workspace: float = 4.0,
    calibration_cache: Path | str | None = None,
    timing_cache: Path | str | None = None,
    shapes: list[tuple[str, tuple[int, ...]]] | None = None,
    input_tensor_formats: list[tuple[str, trt.DataType, trt.TensorFormat]] | None = None,
    output_tensor_formats: list[tuple[str, trt.DataType, trt.TensorFormat]] | None = None,
    hooks: list[Callable[[trt.INetworkDefinition], trt.INetworkDefinition]] | None = None,
    optimization_level: int = 3,
    *,
    direct_io: bool = False,
    prefer_precision_constraints: bool = False,
    reject_empty_algorithms: bool = False,
    ignore_timing_mismatch: bool = False,
    fp8: bool | None = None,
    cache: bool | None = None,
    verbose: bool | None = None,
) -> None:
    """
    Automatically build a TensorRT engine for DLA with automatic layer assignments.

    This function will:
    1. Check which layers can run on DLA
    2. Find the largest chunk of DLA-compatible layers
    3. Assign those layers to DLA with INT8 precision
    4. Assign remaining layers to GPU with FP16 precision

    If every layer is DLA-compatible the whole model is built for the DLA.
    If no layer is DLA-compatible a warning is logged and a GPU-only
    engine is built instead.

    Parameters
    ----------
    onnx : Path, str
        The path to the ONNX model
    output_path : Path, str
        The path where the engine should be saved
    data_batcher : AbstractBatcher
        The data batcher instance for INT8 calibration
    dla_core : int
        The DLA core to use
    max_chunks : int, optional
        The maximum number of DLA-compatible chunks to assign to the DLA.
        Chunks are considered from largest to smallest.
        By default 1, which will assign the largest compatible chunk.
        Can set to 0 to assign all chunks which meet min_layers.
    min_layers : int, optional
        The minimum number of layers in a chunk to be assigned to DLA.
        By default 20, which will assign chunks with at least 20 layers.
        Can set to 0 to assign all chunks.
    workspace : float
        The size of the workspace in gigabytes.
        Default is 4.0 GiB.
    calibration_cache : Path, str, optional
        The path to the calibration cache.
    timing_cache : Path, str, optional
        Where to store the timing cache data.
        Default is None.
    shapes : list[tuple[str, tuple[int, ...]]], optional
        A list of (input_name, shape) pairs to specify the shapes of the input layers.
        For example, shapes=[("images", (1, 3, imgsz, imgsz))] will set the input
        "images" to a fixed shape. This shape will be used as the min, optimal,
        and max shape for the binding.
        By default, None.
    input_tensor_formats : list[tuple[str, trt.DataType, trt.TensorFormat]], optional
        A list of (name, dtype format) to allow deep specification of input layers.
        For example, input_tensor_formats=[("input", trt.DataType.UINT8, trt.TensorFormat.HWC)]
        By default, None
    output_tensor_formats : list[tuple[str, trt.DataType, trt.TensorFormat]], optional
        A list of (name, dtype format) to allow deep specification of output layers.
        For example, output_tensor_formats=[("output", trt.DataType.HALF, trt.TensorFormat.LINEAR)]
        By default, None
    hooks : list[Callable[[trt.INetworkDefinition], trt.INetworkDefinition]], optional
        An optional list of 'hook' functions to modify the TensorRT network before
        the remainder of the build phase occurs.
        By default, None
    optimization_level : int, optional
        Optimization level to apply to the TensorRT builder config (0-5).
        Forwarded to the engine build on every code path.
        By default, 3.
    direct_io : bool
        Use direct IO for the engine.
        By default, False
    prefer_precision_constraints : bool
        Whether or not to prefer precision constraints.
        By default, False
    reject_empty_algorithms : bool
        Whether or not to reject empty algorithms.
        By default, False
    ignore_timing_mismatch : bool
        Whether or not to allow different CUDA device generated timing
        caches to be used in the building of engines.
        By default, False
    fp8 : bool, optional
        If True, enable FP8 precision for GPU layers.
        Requires compute capability >= 8.9 (Ada Lovelace / Hopper or newer).
        DLA layers will still use INT8 precision.
    cache : bool, optional
        Whether or not to cache the engine in the trtutils engine cache.
        If an existing version is found will use that.
        Uses the name of the output file to assess if the engine has been compiled before.
        As such, naming the output 'engine', 'model' or similar will result in
        unintended caching behavior.
        By default None, will not cache the engine.
    verbose : bool, optional
        Whether to print verbose output, by default None

    """
    # read the onnx path
    network, _, config, _ = read_onnx(onnx)

    # check layers for DLA compatibility and use int8 precision
    full_dla, chunks = can_run_on_dla(
        onnx=network,
        config=config,
        verbose_layers=verbose,
        verbose_chunks=verbose,
    )
    if verbose:
        LOG.info(f"Model can run fully on DLA: {full_dla}")
        LOG.info(f"Found {len(chunks)} chunks of layers")

    # case where the entire model can run on DLA
    if full_dla:
        build_engine(
            onnx,
            output_path,
            default_device=trt.DeviceType.DLA,
            data_batcher=data_batcher,
            workspace=workspace,
            timing_cache=timing_cache,
            calibration_cache=calibration_cache,
            dla_core=dla_core,
            shapes=shapes,
            input_tensor_formats=input_tensor_formats,
            output_tensor_formats=output_tensor_formats,
            hooks=hooks,
            optimization_level=optimization_level,
            direct_io=direct_io,
            prefer_precision_constraints=prefer_precision_constraints,
            reject_empty_algorithms=reject_empty_algorithms,
            ignore_timing_mismatch=ignore_timing_mismatch,
            cache=cache,
            fp16=True,
            fp8=fp8,
            int8=True,
            verbose=verbose,
        )
        return

    # keep only the chunks flagged as DLA-compatible
    # each chunk is (layers, start, end, on_dla)
    dla_chunks = [chunk for chunk in chunks if chunk[3]]

    # case where no DLA layers are found
    if not dla_chunks:
        LOG.warning("No DLA-compatible layers found. Building GPU-only engine.")
        build_engine(
            onnx,
            output_path,
            workspace=workspace,
            timing_cache=timing_cache,
            calibration_cache=calibration_cache,
            data_batcher=data_batcher,
            shapes=shapes,
            input_tensor_formats=input_tensor_formats,
            output_tensor_formats=output_tensor_formats,
            hooks=hooks,
            optimization_level=optimization_level,
            direct_io=direct_io,
            prefer_precision_constraints=prefer_precision_constraints,
            reject_empty_algorithms=reject_empty_algorithms,
            ignore_timing_mismatch=ignore_timing_mismatch,
            fp16=True,
            fp8=fp8,
            int8=True,
            cache=cache,
            verbose=verbose,
        )
        return

    # sort chunks by len (largest first); the sort is stable so chunks of
    # equal size keep their network order
    dla_chunks = sorted(dla_chunks, key=lambda chunk: len(chunk[0]), reverse=True)
    if verbose:
        LOG.info(
            f"Found {len(chunks)} total chunks of which: {len(dla_chunks)} are DLA compatible."
        )

    # define lists for storing layer assignments
    layer_precision: list[tuple[int, trt.DataType | None]] = []
    layer_device: list[tuple[int, trt.DeviceType | None]] = []

    # assign default to GPU/FP16
    exclude_layer_types = [trt.LayerType.CONSTANT, trt.LayerType.SHUFFLE]
    for idx in range(network.num_layers):
        layer = network.get_layer(idx)
        layer_name: str = layer.name.lower()
        layer_device.append((idx, trt.DeviceType.GPU))
        # intelligently assign precision level to HALF unless layer
        # is Constant, Shuffle, or Tile
        if layer.type in exclude_layer_types or "tile" in layer_name:
            layer_precision.append((idx, None))
        else:
            layer_precision.append((idx, trt.DataType.HALF))

    # select the chunks which meet min_layers, stopping at max_chunks
    # (a max_chunks of 0 or less means no limit)
    chunk_limit = max_chunks if max_chunks > 0 else None
    selected_chunks = [chunk for chunk in dla_chunks if len(chunk[0]) >= min_layers]
    selected_chunks = selected_chunks[:chunk_limit]

    # iterate over the selected chunks and assign to DLA
    for _, start, end, _ in selected_chunks:
        for layer_id in range(start, end + 1):
            layer_precision[layer_id] = (layer_id, trt.DataType.INT8)
            layer_device[layer_id] = (layer_id, trt.DeviceType.DLA)

    # verbose iteration
    # NOTE: layers left without an explicit precision (None) are also
    # reported as FP16 by this message
    if verbose:
        for (idx, device), (_, datatype) in zip(layer_device, layer_precision):
            LOG.info(
                f"Layer {idx}: {network.get_layer(idx).name}, "
                f"{'DLA' if device == trt.DeviceType.DLA else 'GPU'}, "
                f"{'INT8' if datatype == trt.DataType.INT8 else 'FP16'}"
            )

    # build engine with specific layer assignments
    build_engine(
        onnx,
        output_path,
        default_device=trt.DeviceType.DLA,  # default device DLA
        timing_cache=timing_cache,
        workspace=workspace,
        calibration_cache=calibration_cache,
        data_batcher=data_batcher,
        layer_precision=layer_precision,
        layer_device=layer_device,
        dla_core=dla_core,  # ensure DLA core is maintained
        shapes=shapes,
        input_tensor_formats=input_tensor_formats,
        output_tensor_formats=output_tensor_formats,
        hooks=hooks,
        optimization_level=optimization_level,
        gpu_fallback=True,  # enable GPU fallback to account for input/copy
        direct_io=direct_io,
        prefer_precision_constraints=prefer_precision_constraints,
        reject_empty_algorithms=reject_empty_algorithms,
        ignore_timing_mismatch=ignore_timing_mismatch,
        fp16=True,
        fp8=fp8,
        int8=True,
        cache=cache,
        verbose=verbose,
    )