# Source code for trtutils.builder._dla

# Copyright (c) 2024 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
# mypy: disable-error-code="import-untyped"
from __future__ import annotations

import contextlib
from typing import TYPE_CHECKING

with contextlib.suppress(ImportError):
    import tensorrt as trt

from trtutils._log import LOG

from ._build import build_engine
from ._onnx import read_onnx
from ._utils import get_check_dla

if TYPE_CHECKING:
    from collections.abc import Callable
    from pathlib import Path

    from ._batcher import AbstractBatcher



# [docs]
def can_run_on_dla(
    onnx: Path | str | trt.INetworkDefinition,
    config: trt.IBuilderConfig | None = None,
    *,
    verbose_layers: bool | None = None,
    verbose_chunks: bool | None = None,
) -> tuple[bool, list[tuple[list[trt.ILayer], int, int, bool]]]:
    """
    Whether or not the entire model can be run on a DLA.

    The network layers are walked in index order and grouped into chunks of
    consecutive layers which share the same DLA compatibility.

    Parameters
    ----------
    onnx : Path, str, or trt.INetworkDefinition
        The path to the onnx file or a pre-made TensorRT network.
    config : trt.IBuilderConfig, optional
        The TensorRT builder config. Required if onnx is a network.
        Ignored (replaced by the config from read_onnx) if onnx is a path.
    verbose_layers : bool, optional
        Whether to print verbose output for individual layers, by default None
    verbose_chunks : bool, optional
        Whether to print verbose output for layer chunks, by default None

    Returns
    -------
    tuple[bool, list[tuple[list[trt.ILayer], int, int, bool]]]
        Whether or not the model will all run on DLA and each block of layers.
        Where each block can run on a single device, DLA or GPU.
        Each block is ``(layers, start_index, end_index, on_dla)`` where the
        indices are inclusive layer indices in the network.
        A network with no layers yields ``(True, [])``.

    Raises
    ------
    ValueError
        If config is not provided when onnx is a network

    Notes
    -----
    The config in use has ``default_device_type`` set to DLA and
    ``DLA_core`` set to 0 by this function. When a config is passed in by
    the caller, that object is modified in place.

    """
    # handle network input
    if isinstance(onnx, trt.INetworkDefinition):
        if config is None:
            err_msg = "Config must be provided when onnx is a network"
            raise ValueError(err_msg)
        network = onnx
    else:
        network, _, config, _ = read_onnx(onnx)

    check_dla: Callable[[trt.ILayer], bool] = get_check_dla(config)

    # assign to DLA 0, since core doesn't matter for this check
    config.default_device_type = trt.DeviceType.DLA
    config.DLA_core = 0

    full_dla = True
    last_layer_dla = False
    chunks: list[tuple[list[trt.ILayer], int, int, bool]] = []
    curr_start: int = 0
    curr_layers: list[trt.ILayer] = []

    for idx in range(network.num_layers):
        layer = network.get_layer(idx)

        # check if the layer can run on DLA
        dla_valid = check_dla(layer)
        if not dla_valid:
            full_dla = False

        # compatibility flipped: close out the running chunk and start a new one
        # (never triggers on the first layer since no chunk is open yet)
        if curr_layers and dla_valid != last_layer_dla:
            chunks.append((curr_layers, curr_start, idx - 1, last_layer_dla))
            curr_layers = []
            curr_start = idx
        curr_layers.append(layer)

        last_layer_dla = dla_valid

        if verbose_layers:
            LOG.info(
                f"Layer {idx}: {layer.name}, {layer.type}, {layer.precision}, {layer.metadata}",
            )
            LOG.info(f"\tDLA: {dla_valid}")

    # handle final chunk; skipped for an empty network so that no
    # zero-length chunk with an end index of -1 is reported
    if curr_layers:
        chunks.append(
            (curr_layers, curr_start, network.num_layers - 1, last_layer_dla)
        )

    if verbose_chunks:
        LOG.info(f"Found {len(chunks)} Chunks of Layers")
        for i, (layers, start, end, on_dla) in enumerate(chunks):
            LOG.info(
                f"\tChunk {i}: [{start} - {end}], {len(layers)} layers, {'DLA' if on_dla else 'GPU'}"
            )

    return full_dla, chunks




# [docs]
def build_dla_engine(
    onnx: Path | str,
    output_path: Path | str,
    data_batcher: AbstractBatcher,
    dla_core: int,
    max_chunks: int = 1,
    min_layers: int = 20,
    timing_cache: Path | str | None = None,
    *,
    verbose: bool | None = None,
) -> None:
    """
    Automatically build a TensorRT engine for DLA with automatic layer assignments.

    This function will:
    1. Check which layers can run on DLA
    2. If every layer can run on DLA, build a DLA engine (FP16 + INT8)
    3. If no layer can run on DLA, build a GPU-only FP16 engine
    4. Otherwise, pick the largest chunks of DLA-compatible layers
       (limited by max_chunks and min_layers) and assign those layers
       to DLA with INT8 precision
    5. Assign remaining layers to GPU with FP16 precision, except for
       Constant and Shuffle layers and layers with "tile" in their name,
       which are left without an explicit precision

    Parameters
    ----------
    onnx : Path, str
        The path to the ONNX model
    output_path : Path, str
        The path where the engine should be saved
    data_batcher : AbstractBatcher
        The data batcher instance for INT8 calibration
    dla_core : int
        The DLA core to use
    max_chunks : int, optional
        The maximum number of DLA-compatible chunks to assign to the DLA.
        By default 1, which will assign the largest compatible chunk.
        Chunks are considered from largest to smallest.
        Can set to 0 to assign all chunks which meet min_layers.
    min_layers : int, optional
        The minimum number of layers in a chunk to be assigned to DLA.
        By default 20, which will assign chunks with at least 20 layers.
        Can set to 0 to assign all chunks.
    timing_cache : Path, str, optional
        The path to the timing cache file
    verbose : bool, optional
        Whether to print verbose output, by default None

    """
    # read the onnx path
    network, _, config, _ = read_onnx(onnx)

    # check layers for DLA compatibility and use int8 precision
    full_dla, chunks = can_run_on_dla(
        onnx=network,
        config=config,
        verbose_layers=verbose,
        verbose_chunks=verbose,
    )

    if verbose:
        LOG.info(f"Model can run fully on DLA: {full_dla}")
        LOG.info(f"Found {len(chunks)} chunks of layers")

    # case where the entire model can run on DLA
    if full_dla:
        build_engine(
            onnx,
            output_path,
            default_device=trt.DeviceType.DLA,
            timing_cache=timing_cache,
            data_batcher=data_batcher,
            dla_core=dla_core,
            fp16=True,
            int8=True,
            verbose=verbose,
        )
        return

    # keep only the chunks which can run on DLA (chunk[3] is the on_dla flag)
    dla_chunks = [chunk for chunk in chunks if chunk[3]]

    # case where no DLA layers are found
    if not dla_chunks:
        LOG.warning("No DLA-compatible layers found. Building GPU-only engine.")
        build_engine(
            onnx,
            output_path,
            timing_cache=timing_cache,
            fp16=True,
            verbose=verbose,
        )
        return

    # largest chunks first so that max_chunks keeps the biggest ones
    dla_chunks.sort(key=lambda chunk: len(chunk[0]), reverse=True)

    if verbose:
        LOG.info(
            f"Found {len(chunks)} total chunks of which: {len(dla_chunks)} are DLA compatible."
        )

    # define lists for storing layer assignments
    layer_precision: list[tuple[int, trt.DataType | None]] = []
    layer_device: list[tuple[int, trt.DeviceType | None]] = []

    # assign default to GPU/FP16
    exclude_layer_types = (trt.LayerType.CONSTANT, trt.LayerType.SHUFFLE)
    for idx in range(network.num_layers):
        layer = network.get_layer(idx)
        layer_name: str = layer.name.lower()
        layer_device.append((idx, trt.DeviceType.GPU))
        # intelligently assign precision level to HALF unless layer
        # is Constant, Shuffle, or Tile
        if layer.type in exclude_layer_types or "tile" in layer_name:
            layer_precision.append((idx, None))
        else:
            layer_precision.append((idx, trt.DataType.HALF))

    # iterate over chunks and assign to DLA
    matched_chunks = 0
    for layers, start, end, _ in dla_chunks:
        # a non-positive max_chunks means there is no limit
        if max_chunks > 0 and matched_chunks >= max_chunks:
            break
        # chunks are sorted largest first, so no later chunk can qualify
        if len(layers) < min_layers:
            break

        # start/end are inclusive layer indices
        for layer_id in range(start, end + 1):
            layer_precision[layer_id] = (layer_id, trt.DataType.INT8)
            layer_device[layer_id] = (layer_id, trt.DeviceType.DLA)

        matched_chunks += 1

    if matched_chunks == 0:
        LOG.warning(
            f"No DLA-compatible chunk has at least {min_layers} layers. "
            "No layers were assigned to DLA."
        )

    # verbose iteration
    if verbose:
        for (idx, device), (_, datatype) in zip(layer_device, layer_precision):
            # layers without an explicit precision are not forced to FP16
            if datatype is None:
                precision_name = "DEFAULT"
            elif datatype == trt.DataType.INT8:
                precision_name = "INT8"
            else:
                precision_name = "FP16"
            LOG.info(
                f"Layer {idx}: {network.get_layer(idx).name}, "
                f"{'DLA' if device == trt.DeviceType.DLA else 'GPU'}, "
                f"{precision_name}"
            )

    # build engine with specific layer assignments
    build_engine(
        onnx,
        output_path,
        default_device=trt.DeviceType.GPU,  # default device GPU
        timing_cache=timing_cache,
        data_batcher=data_batcher,
        layer_precision=layer_precision,
        layer_device=layer_device,
        dla_core=dla_core,  # ensure DLA core is maintained
        gpu_fallback=True,  # enable GPU fallback to account for input/copy
        fp16=True,
        int8=True,
        verbose=verbose,
    )