# Source code for trtutils._profile

# Copyright (c) 2025 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
from __future__ import annotations

from typing import TYPE_CHECKING

from trtutils.jetson._profile import profile_engine as jetson_profile_engine
from trtutils.profiling._profiler import ProfilerResult
from trtutils.profiling._profiler import profile_engine as inspect_profile_engine

if TYPE_CHECKING:
    from pathlib import Path

    from trtutils._engine import TRTEngine
    from trtutils.jetson._profile import JetsonProfilerResult



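# NOTE: a "[docs]" link marker from the rendered documentation page stood here; it is not part of the module.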
def profile_engine(
    engine: Path | str | TRTEngine,
    iterations: int | None = None,
    warmup_iterations: int = 10,
    dla_core: int | None = None,
    device: int | None = None,
    tegra_interval: int = 5,
    *,
    jetson: bool = False,
    warmup: bool | None = None,
    verbose: bool | None = None,
) -> ProfilerResult | JetsonProfilerResult:
    """
    Profile a TensorRT engine layer-by-layer.

    This is a dispatcher function that calls either the standard profiler or
    the Jetson-specific profiler based on the jetson parameter.

    This function runs inference multiple times and collects per-layer execution
    times using TensorRT's IProfiler interface. On Jetson devices with jetson=True,
    it also collects power and energy metrics. It returns aggregated statistics
    (mean, median, min, max) for each layer across all iterations.

    Notes
    -----
    For best results, build the engine with profiling_verbosity set to DETAILED
    when calling build_engine. Otherwise, layer names may be numeric indices.

    When jetson=True and iterations is left as None, the iterations argument is
    not forwarded, so the Jetson profiler applies its own default (10000 instead
    of 100) to ensure adequate tegrastats sampling coverage across all layers.
    You can override this by explicitly providing the iterations parameter.

    Parameters
    ----------
    engine : Path | str | TRTEngine
        The engine to profile. Either a TRTEngine object or path to the engine file.
        If a path is given, then a TRTEngine will be created automatically.
    iterations : int, optional
        The number of profiling iterations to run, by default None.
        If None, 100 iterations are used for standard profiling, and the
        Jetson profiler's own default (10000) is used when jetson=True.
    warmup_iterations : int, optional
        The number of warmup iterations to run before profiling, by default 10.
    dla_core : int, optional
        The DLA core to assign DLA layers of the engine to. Default is None.
        If None, any DLA layers will be assigned to DLA core 0.
    device : int, optional
        The CUDA device index to use for the engine. Default is None,
        which uses the current device.
    tegra_interval : int, optional
        The interval in milliseconds between tegrastats samples (Jetson only),
        by default 5. Only used when jetson=True.
    jetson : bool, optional
        Whether to use Jetson-specific profiling with power/energy metrics,
        by default False.
    warmup : bool, optional
        Whether to do warmup iterations, by default None.
        If None, warmup will be set to True.
    verbose : bool, optional
        Whether to output additional information to stdout.
        Default None/False.

    Returns
    -------
    ProfilerResult | JetsonProfilerResult
        If jetson=False: ProfilerResult containing per-layer timing statistics
        and total execution time.
        If jetson=True: JetsonProfilerResult containing per-layer timing statistics
        with power/energy data, total execution time, overall power draw, and overall
        energy consumption.

    """
    if jetson:
        # Forward `iterations` only when the caller supplied one. Always passing
        # the dispatcher's own default would silently replace the Jetson
        # profiler's larger default, which exists for tegrastats coverage.
        # NOTE(review): relies on the Jetson profile_engine declaring a default
        # for `iterations` -- confirm against trtutils.jetson._profile.
        iteration_override = {} if iterations is None else {"iterations": iterations}
        return jetson_profile_engine(
            engine=engine,
            warmup_iterations=warmup_iterations,
            tegra_interval=tegra_interval,
            dla_core=dla_core,
            device=device,
            warmup=warmup,
            verbose=verbose,
            **iteration_override,
        )

    # Standard profiling keeps the historical default of 100 iterations.
    return inspect_profile_engine(
        engine=engine,
        iterations=100 if iterations is None else iterations,
        warmup_iterations=warmup_iterations,
        dla_core=dla_core,
        device=device,
        warmup=warmup,
        verbose=verbose,
    )