# Source code for trtutils._profile

# Copyright (c) 2025 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
from __future__ import annotations

from typing import TYPE_CHECKING

from trtutils.jetson._profile import profile_engine as jetson_profile_engine
from trtutils.profiling._profiler import ProfilerResult
from trtutils.profiling._profiler import profile_engine as inspect_profile_engine

if TYPE_CHECKING:
    from pathlib import Path

    from trtutils._engine import TRTEngine
    from trtutils.jetson._profile import JetsonProfilerResult


def profile_engine(
    engine: Path | str | TRTEngine,
    iterations: int = 100,
    warmup_iterations: int = 10,
    dla_core: int | None = None,
    device: int | None = None,
    tegra_interval: int = 5,
    *,
    jetson: bool = False,
    warmup: bool | None = None,
    verbose: bool | None = None,
) -> ProfilerResult | JetsonProfilerResult:
    """
    Profile a TensorRT engine layer-by-layer.

    Thin dispatcher: every argument is forwarded by keyword to exactly one
    backend. With ``jetson=False`` (the default) the standard profiler from
    ``trtutils.profiling`` is called; with ``jetson=True`` the Jetson
    profiler from ``trtutils.jetson`` is called instead, and additionally
    receives ``tegra_interval``.

    The backends run inference repeatedly and gather per-layer execution
    times, which are aggregated (mean, median, min, max) per layer over all
    iterations. The Jetson backend also reports power and energy metrics.

    Notes
    -----
    For readable layer names, build the engine with profiling_verbosity set
    to DETAILED when calling build_engine. Otherwise, layer names may be
    numeric indices.

    ``iterations`` is always passed through to the selected backend, so the
    default of 100 applies in both modes.
    NOTE(review): the Jetson profiler may define a larger default of its own
    (to give tegrastats enough samples per layer); because this function
    always forwards ``iterations``, that default is never used from here.
    Pass a larger value explicitly when ``jetson=True`` if more sampling
    coverage is needed - confirm against the Jetson profiler's signature.

    Parameters
    ----------
    engine : Path | str | TRTEngine
        The engine to profile. Either a TRTEngine object or a path to an
        engine file. Forwarded unchanged; a path is expected to be loaded
        into a TRTEngine by the backend.
    iterations : int, optional
        The number of profiling iterations to run, by default 100.
    warmup_iterations : int, optional
        The number of warmup iterations to run before profiling,
        by default 10.
    dla_core : int, optional
        The DLA core to assign DLA layers of the engine to.
        Default is None, which is forwarded as-is (documented backend
        behavior: DLA layers are assigned to DLA core 0).
    device : int, optional
        The CUDA device index to use for the engine.
        Default is None, which is forwarded as-is (documented backend
        behavior: the current device is used).
    tegra_interval : int, optional
        The interval in milliseconds between tegrastats samples,
        by default 5. Only forwarded when jetson=True; ignored otherwise.
    jetson : bool, optional
        Whether to use the Jetson-specific profiler with power/energy
        metrics, by default False. Keyword-only.
    warmup : bool, optional
        Whether to do warmup iterations, by default None. None is forwarded
        as-is (documented backend behavior: warmup is enabled).
        Keyword-only.
    verbose : bool, optional
        Whether to output additional information to stdout.
        Default None/False. Keyword-only.

    Returns
    -------
    ProfilerResult | JetsonProfilerResult
        If jetson=False: the ProfilerResult returned by the standard
        profiler (per-layer timing statistics and total execution time).
        If jetson=True: the JetsonProfilerResult returned by the Jetson
        profiler (per-layer timing statistics with power/energy data, total
        execution time, overall power draw and overall energy consumption).

    """
    if not jetson:
        # Standard path: the generic profiler has no tegrastats sampling,
        # so tegra_interval is intentionally not forwarded.
        return inspect_profile_engine(
            engine=engine,
            iterations=iterations,
            warmup_iterations=warmup_iterations,
            dla_core=dla_core,
            device=device,
            warmup=warmup,
            verbose=verbose,
        )

    # Jetson path: same arguments plus the tegrastats sampling interval.
    return jetson_profile_engine(
        engine=engine,
        iterations=iterations,
        warmup_iterations=warmup_iterations,
        tegra_interval=tegra_interval,
        dla_core=dla_core,
        device=device,
        warmup=warmup,
        verbose=verbose,
    )