Source code for trtutils.builder._dla

# Copyright (c) 2024 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
# mypy: disable-error-code="import-untyped"
from __future__ import annotations

import contextlib
from typing import TYPE_CHECKING

with contextlib.suppress(ImportError):
    import tensorrt as trt

from trtutils._log import LOG

from ._build import build_engine
from ._onnx import read_onnx
from ._utils import get_check_dla

if TYPE_CHECKING:
    from collections.abc import Callable
    from pathlib import Path

    from ._batcher import AbstractBatcher


def can_run_on_dla(
    onnx: Path | str | trt.INetworkDefinition,
    config: trt.IBuilderConfig | None = None,
    *,
    verbose_layers: bool | None = None,
    verbose_chunks: bool | None = None,
) -> tuple[bool, list[tuple[list[trt.ILayer], int, int, bool]]]:
    """
    Whether or not the entire model can be run on a DLA.

    Every layer of the network is checked for DLA compatibility and
    consecutive layers which share the same result are grouped into chunks.

    Parameters
    ----------
    onnx : Path, str, or trt.INetworkDefinition
        The path to the onnx file or a pre-made TensorRT network.
    config : trt.IBuilderConfig, optional
        The TensorRT builder config.
        Required if onnx is a network. If onnx is a path, the config
        returned by reading the onnx file is used instead of this argument.
    verbose_layers : bool, optional
        Whether to print verbose output for individual layers, by default None
    verbose_chunks : bool, optional
        Whether to print verbose output for layer chunks, by default None

    Returns
    -------
    tuple[bool, list[tuple[list[trt.ILayer], int, int, bool]]]
        Whether or not the model will all run on DLA and each block of layers.
        Where each block can run on a single device, DLA or GPU.
        Each block is stored as (layers, start index, end index, on DLA),
        where both indices are inclusive layer indices in the network.
        A network without any layers gives an empty list of blocks.

    Raises
    ------
    ValueError
        If config is not provided when onnx is a network

    Notes
    -----
    The config in use is modified in place: the default device type is
    set to DLA and the DLA core is set to 0. A config passed in by the
    caller keeps these values after the call returns.

    """
    # handle network input
    if isinstance(onnx, trt.INetworkDefinition):
        if config is None:
            err_msg = "Config must be provided when onnx is a network"
            raise ValueError(err_msg)
        network = onnx
    else:
        network, _, config, _ = read_onnx(onnx)
    check_dla: Callable[[trt.ILayer], bool] = get_check_dla(config)

    # assign to DLA 0, since core doesnt matter for this check
    config.default_device_type = trt.DeviceType.DLA
    config.DLA_core = 0

    num_layers: int = network.num_layers
    full_dla = True
    last_layer_dla = False
    chunks: list[tuple[list[trt.ILayer], int, int, bool]] = []
    curr_start: int = 0
    curr_layers: list[trt.ILayer] = []
    for idx in range(num_layers):
        layer = network.get_layer(idx)

        # check if the layer can run on DLA
        dla_valid = check_dla(layer)
        if not dla_valid:
            full_dla = False

        # handle chunk storage
        # a change in compatibility closes the running chunk, the check on
        # curr_layers stops an empty chunk being stored for the first layer
        if dla_valid != last_layer_dla and curr_layers:
            chunks.append((curr_layers, curr_start, idx - 1, last_layer_dla))
            curr_layers = [layer]
            curr_start = idx
        else:
            curr_layers.append(layer)
        last_layer_dla = dla_valid

        if verbose_layers:
            LOG.info(
                f"Layer {idx}: {layer.name}, {layer.type}, {layer.precision}, {layer.metadata}",
            )
            LOG.info(f"\tDLA: {dla_valid}")

    # handle final chunk
    # skipped for a network without layers, otherwise an empty chunk
    # ending at index -1 would be reported
    if curr_layers:
        chunks.append((curr_layers, curr_start, num_layers - 1, last_layer_dla))

    if verbose_chunks:
        LOG.info(f"Found {len(chunks)} Chunks of Layers")
        for i, (layers, start, end, on_dla) in enumerate(chunks):
            LOG.info(
                f"\tChunk {i}: [{start} - {end}], {len(layers)} layers, {'DLA' if on_dla else 'GPU'}"
            )

    return full_dla, chunks
def build_dla_engine(
    onnx: Path | str,
    output_path: Path | str,
    data_batcher: AbstractBatcher,
    dla_core: int,
    max_chunks: int = 1,
    min_layers: int = 20,
    timing_cache: Path | str | None = None,
    *,
    verbose: bool | None = None,
) -> None:
    """
    Automatically build a TensorRT engine for DLA with automatic layer assignments.

    This function will:
    1. Check which layers can run on DLA
    2. Find the largest chunk of DLA-compatible layers
    3. Assign those layers to DLA with INT8 precision
    4. Assign remaining layers to GPU with FP16 precision
       (Constant, Shuffle, and Tile layers get no precision assigned)

    If every layer can run on DLA, the engine is built with DLA as the
    default device. If no layer can run on DLA, a GPU-only FP16 engine
    is built instead.

    Parameters
    ----------
    onnx : Path, str
        The path to the ONNX model
    output_path : Path, str
        The path where the engine should be saved
    data_batcher : AbstractBatcher
        The data batcher instance for INT8 calibration
    dla_core : int
        The DLA core to use
    max_chunks : int, optional
        The maximum number of DLA-compatible chunks to assign to the DLA.
        By default 1, which will assign the largest compatible chunk.
        Chunks are considered from most layers to fewest layers.
        Can set to 0 to assign all chunks which meet min_layers.
    min_layers : int, optional
        The minimum number of layers in a chunk to be assigned to DLA.
        By default 20, which will assign chunks with at least 20 layers.
        Can set to 0 to assign all chunks.
    timing_cache : Path, str, optional
        The path to the timing cache file
    verbose : bool, optional
        Whether to print verbose output, by default None

    """
    # read the onnx path
    network, _, config, _ = read_onnx(onnx)

    # check layers for DLA compatibility and use int8 precision
    full_dla, chunks = can_run_on_dla(
        onnx=network,
        config=config,
        verbose_layers=verbose,
        verbose_chunks=verbose,
    )
    if verbose:
        LOG.info(f"Model can run fully on DLA: {full_dla}")
        LOG.info(f"Found {len(chunks)} chunks of layers")

    # case where the entire model can run on DLA
    if full_dla:
        build_engine(
            onnx,
            output_path,
            default_device=trt.DeviceType.DLA,
            timing_cache=timing_cache,
            data_batcher=data_batcher,
            dla_core=dla_core,
            fp16=True,
            int8=True,
            verbose=verbose,
        )
        return

    # identify if any chunks contain DLA layers
    dla_chunks = [(i, chunk) for i, chunk in enumerate(chunks) if chunk[3]]

    # case where no DLA layers are found
    if not dla_chunks:
        LOG.warning("No DLA-compatible layers found. Building GPU-only engine.")
        build_engine(
            onnx,
            output_path,
            timing_cache=timing_cache,
            fp16=True,
            verbose=verbose,
        )
        return

    # sort chunks by len and filter by min_layers or until max_chunks is reached
    dla_chunks = sorted(dla_chunks, key=lambda x: len(x[1][0]), reverse=True)
    if verbose:
        # dla_chunks only holds DLA chunks, the total comes from chunks
        LOG.info(
            f"Found {len(chunks)} total chunks of which: {len(dla_chunks)} are DLA compatible."
        )

    # define lists for storing layer assignments
    layer_precision: list[tuple[int, trt.DataType | None]] = []
    layer_device: list[tuple[int, trt.DeviceType | None]] = []

    # assign default to GPU/FP16
    exclude_layer_types = (trt.LayerType.CONSTANT, trt.LayerType.SHUFFLE)
    for idx in range(network.num_layers):
        layer = network.get_layer(idx)
        layer_name: str = layer.name.lower()

        layer_device.append((idx, trt.DeviceType.GPU))

        # intelligently assign precision level to HALF unless layer
        # is Constant, Shuffle, or Tile
        if layer.type in exclude_layer_types or "tile" in layer_name:
            layer_precision.append((idx, None))
        else:
            layer_precision.append((idx, trt.DataType.HALF))

    # iterate over chunks and assign to DLA
    matched_chunks = 0
    for _, (layers, start, end, _on_dla) in dla_chunks:
        # max_chunks of zero (or less) means there is no limit
        if matched_chunks >= max_chunks and max_chunks > 0:
            break
        # chunks are sorted largest first, once one is too small
        # every remaining chunk is too small as well
        if len(layers) < min_layers:
            break

        # start and end are inclusive layer indices
        for layer_id in range(start, end + 1):
            layer_precision[layer_id] = (layer_id, trt.DataType.INT8)
            layer_device[layer_id] = (layer_id, trt.DeviceType.DLA)

        matched_chunks += 1

    if matched_chunks == 0:
        LOG.warning(
            f"No DLA-compatible chunk has at least {min_layers} layers. "
            "All layers remain assigned to GPU."
        )

    # verbose iteration
    if verbose:
        for (idx, device), (_, datatype) in zip(layer_device, layer_precision):
            # layers without an assigned precision are not reported as FP16
            if datatype is None:
                precision_name = "DEFAULT"
            elif datatype == trt.DataType.INT8:
                precision_name = "INT8"
            else:
                precision_name = "FP16"
            LOG.info(
                f"Layer {idx}: {network.get_layer(idx).name}, "
                f"{'DLA' if device == trt.DeviceType.DLA else 'GPU'}, "
                f"{precision_name}"
            )

    # build engine with specific layer assignments
    build_engine(
        onnx,
        output_path,
        default_device=trt.DeviceType.GPU,  # default device GPU
        timing_cache=timing_cache,
        data_batcher=data_batcher,
        layer_precision=layer_precision,
        layer_device=layer_device,
        dla_core=dla_core,  # ensure DLA core is maintained
        gpu_fallback=True,  # enable GPU fallback to account for input/copy
        fp16=True,
        int8=True,
        verbose=verbose,
    )