Source code for trtutils.jetson._profile

# Copyright (c) 2025-2026 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
from __future__ import annotations

import time
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from statistics import mean, median
from typing import TYPE_CHECKING

from jetsontools import TegraData, TegraStats, filter_data, get_powerdraw

from trtutils._benchmark import Metric
from trtutils._engine import TRTEngine
from trtutils._log import LOG
from trtutils.compat._libs import trt
from trtutils.profiling._profiler import LayerTiming, ProfilerResult

if TYPE_CHECKING:
    from collections.abc import Sequence

    from jetsontools._parsing import Metric as JMetric
    from typing_extensions import Self


[docs] @dataclass class JetsonLayerInfo(LayerTiming): """ A dataclass to store per-layer profiling statistics for Jetson devices. Extends LayerTiming with power and energy metrics. Attributes ---------- name : str The name of the layer. mean : float The mean execution time in milliseconds. median : float The median execution time in milliseconds. min : float The minimum execution time in milliseconds. max : float The maximum execution time in milliseconds. raw : list[float] The raw execution times in milliseconds across all iterations. power : float The mean power draw in milliwatts during layer execution. energy : float The mean energy consumption in millijoules per layer execution. """ power: float energy: float def __str__(self: Self) -> str: return ( f"{self.name}: mean={self.mean:.3f}ms, median={self.median:.3f}ms, " f"min={self.min:.3f}ms, max={self.max:.3f}ms, " f"power={self.power:.1f}mW, energy={self.energy:.3f}mJ" ) def __repr__(self: Self) -> str: return ( f"JetsonLayerInfo(name={self.name!r}, mean={self.mean}, median={self.median}, " f"min={self.min}, max={self.max}, power={self.power}, energy={self.energy})" )
[docs] @dataclass class JetsonProfilerResult(ProfilerResult): """ A dataclass to store the complete profiling results for Jetson devices. This extends the standard profiling results with energy and power metrics. Attributes ---------- layers : list[JetsonLayerInfo] The per-layer timing, power, and energy statistics. total_time : LayerTiming The total execution time statistics across all layers. iterations : int The number of profiling iterations performed. power_draw : Metric The power draw statistics in milliwatts. energy : Metric The energy consumption statistics in milliwatt-seconds. """ layers: Sequence[JetsonLayerInfo] power_draw: Metric energy: Metric def __str__(self: Self) -> str: return ( f"JetsonProfilerResult(layers={len(self.layers)}, " f"total_time={self.total_time.mean:.3f}ms, " f"iterations={self.iterations}, " f"power_draw={self.power_draw.mean:.1f}mW, " f"energy={self.energy.mean:.3f}mJ)" ) def __repr__(self: Self) -> str: return ( f"JetsonProfilerResult(layers={self.layers!r}, " f"total_time={self.total_time!r}, " f"iterations={self.iterations}, " f"power_draw={self.power_draw!r}, " f"energy={self.energy!r})" )
class JetsonLayerProfiler(trt.IProfiler): """ A profiler for Jetson devices that tracks per-layer timing and power/energy metrics. This class collects per-layer execution times and timestamps across multiple inference iterations, then correlates these with tegrastats data to compute per-layer power and energy consumption. """ def __init__(self: Self) -> None: """Initialize the JetsonLayerProfiler.""" super().__init__() # Store timings for each layer across iterations # Key: layer_name, Value: list of timings in milliseconds self._timings: dict[str, list[float]] = defaultdict(list) # Store timestamps for each layer across iterations # Key: layer_name, Value: list of (start_time, end_time) tuples self._layer_timestamps: dict[str, list[tuple[float, float]]] = defaultdict(list) # Store current iteration timings and timestamps self._current_iteration_timings: dict[str, float] = {} self._current_iteration_start_time: float = 0.0 self._current_layer_start_time: float = 0.0 def start_iteration(self: Self) -> None: """ Mark the start of a new iteration. This should be called before each inference run to set the baseline timestamp. """ self._current_iteration_start_time = time.time() self._current_layer_start_time = self._current_iteration_start_time def report_layer_time(self: Self, layer_name: str, ms: float) -> None: """ Record the execution time for a layer. This method is called by TensorRT once per layer after inference. Parameters ---------- layer_name : str The name of the layer. ms : float The execution time in milliseconds. """ # Record the timing self._current_iteration_timings[layer_name] = ms # Calculate layer end time # ms is in milliseconds, so convert to seconds layer_end_time = self._current_layer_start_time + (ms / 1000.0) # Store timestamp range for this layer if layer_name not in self._current_iteration_timings: self._current_iteration_timings[layer_name] = ms # Update for next layer self._current_layer_start_time = layer_end_time def finalize_iteration(self: Self) -> None: """ Finalize the current iteration by storing all layer timings and timestamps. This should be called after inference to commit the current iteration's data. """ for layer_name, time_ms in self._current_iteration_timings.items(): self._timings[layer_name].append(time_ms) # Reconstruct timestamps for all layers based on cumulative timing current_time = self._current_iteration_start_time for layer_name, time_ms in self._current_iteration_timings.items(): layer_start = current_time layer_end = current_time + (time_ms / 1000.0) # Convert ms to seconds self._layer_timestamps[layer_name].append((layer_start, layer_end)) current_time = layer_end self._current_iteration_timings.clear() def get_statistics(self: Self) -> list[LayerTiming]: """ Compute basic timing statistics for each layer (without power/energy). Returns ------- list[LayerTiming] A list of LayerTiming objects with timing statistics only. """ layer_stats: list[LayerTiming] = [] for layer_name, times in self._timings.items(): if not times: continue layer_timing = LayerTiming( name=layer_name, mean=mean(times), median=median(times), min=min(times), max=max(times), raw=times.copy(), ) layer_stats.append(layer_timing) return layer_stats def get_statistics_with_tegra( self: Self, tegradata: TegraData, verbose: bool = False, # noqa: FBT001, FBT002 ) -> list[JetsonLayerInfo]: """ Get statistics with tegrastats data to compute power and energy. For each layer, finds tegrastats samples that fall within the layer's execution time window and computes mean power and energy. If no samples are found, uses the last known power value. Parameters ---------- tegradata : TegraData TegraData object containing parsed tegrastats data with timestamps and power measurements. verbose : bool, optional Whether to output verbose logging, by default False. Returns ------- list[JetsonLayerInfo] A list of JetsonLayerInfo objects with timing, power, and energy metrics. """ if verbose: LOG.info("Correlating layer timestamps with tegrastats data") layer_stats: list[JetsonLayerInfo] = [] statistics: list[LayerTiming] = self.get_statistics() for layer_timing in statistics: layer_name = layer_timing.name timestamps = self._layer_timestamps[layer_name] layer_data, _ = filter_data(tegradata.data, timestamps) if len(layer_data) == 0: # for now, set power and energy to 0 power_mw = 0.0 energy_mj = 0.0 else: power_data = get_powerdraw(layer_data) power_mw = power_data["VDD_TOTAL"].mean energy_mj = power_mw * layer_timing.mean / 1000.0 layer_stats.append( JetsonLayerInfo( name=layer_name, mean=layer_timing.mean, median=layer_timing.median, min=layer_timing.min, max=layer_timing.max, raw=layer_timing.raw, power=power_mw, energy=energy_mj, ) ) return layer_stats def reset(self: Self) -> None: """Reset all stored timings and timestamps.""" self._timings.clear() self._layer_timestamps.clear() self._current_iteration_timings.clear() self._current_iteration_start_time = 0.0 self._current_layer_start_time = 0.0
[docs] def profile_engine( engine: Path | str | TRTEngine, iterations: int = 10000, warmup_iterations: int = 10, tegra_interval: int = 5, dla_core: int | None = None, device: int | None = None, *, warmup: bool | None = None, cuda_graph: bool | None = None, verbose: bool | None = None, ) -> JetsonProfilerResult: """ Profile a TensorRT engine layer-by-layer on a Jetson device. This function runs inference multiple times and collects per-layer execution times using TensorRT's IProfiler interface, along with power and energy metrics using tegrastats. It returns aggregated statistics (mean, median, min, max) for each layer across all iterations, plus per-layer power and energy consumption. Notes ----- For best results, build the engine with profiling_verbosity set to DETAILED when calling build_engine. Otherwise, layer names may be numeric indices. The default iteration count is 10000 (higher than standard profiling) to ensure adequate tegrastats sampling coverage across all layers, especially fast-executing ones. Parameters ---------- engine : Path | str | TRTEngine The engine to profile. Either a TRTEngine object or path to the engine file. If a path is given, then a TRTEngine will be created automatically. iterations : int, optional The number of profiling iterations to run, by default 10000. Higher iteration counts provide better coverage for per-layer power metrics. warmup_iterations : int, optional The number of warmup iterations to run before profiling, by default 10. tegra_interval : int, optional The interval in milliseconds between tegrastats samples, by default 5. dla_core : int, optional The DLA core to assign DLA layers of the engine to. Default is None. If None, any DLA layers will be assigned to DLA core 0. device : int, optional The CUDA device index to use for the engine. Default is None, which uses the current device. warmup : bool, optional Whether to do warmup iterations, by default None. If None, warmup will be set to True. cuda_graph : bool, optional Whether to enable CUDA graph capture for optimized execution. By default None, which enables CUDA graphs. Set to False for engines with DLA layers, as DLA does not support CUDA graphs. verbose : bool, optional Whether to output additional information to stdout. Default None/False. Returns ------- JetsonProfilerResult A dataclass containing per-layer timing/power/energy statistics, total execution time, overall power draw, and overall energy consumption. """ if verbose: LOG.info("Starting Jetson engine profiling with per-layer power tracking") if warmup is None: warmup = True engine_loaded = False if isinstance(engine, (Path, str)): engine = TRTEngine( engine, dla_core=dla_core, device=device, warmup=False, cuda_graph=cuda_graph, verbose=verbose, ) engine_loaded = True # issue warning if not built with detailed profiling engine_verbosity = engine.engine.profiling_verbosity if engine_verbosity != trt.ProfilingVerbosity.DETAILED and verbose: LOG.warning( "Engine profiling verbosity is not DETAILED. Layer names may be numeric indices. " "Rebuild the engine with profiling_verbosity=trt.ProfilingVerbosity.DETAILED for best results.", ) # attach JetsonLayerProfiler for per-layer power tracking profiler = JetsonLayerProfiler() engine.context.profiler = profiler # do warmup iterations # always do a single pass regardless of warmup_iterations profiler.start_iteration() engine.mock_execute(verbose=False) profiler.finalize_iteration() if warmup: for _ in range(warmup_iterations): profiler.start_iteration() engine.mock_execute(verbose=False) profiler.finalize_iteration() # report_layer_time is called by the context, so reset after warmup profiler.reset() if verbose: LOG.info(f"Running {iterations} profiling iterations with per-layer power monitoring") # pre-generate the false data false_data = engine.get_random_input(verbose=verbose) # store the start/stop times of each inference timeslices: list[tuple[float, float]] = [] tegrastats = TegraStats(interval=tegra_interval) tegrastats.start() for _ in range(iterations): profiler.start_iteration() t0 = time.time() engine.mock_execute(false_data, verbose=False) t1 = time.time() profiler.finalize_iteration() timeslices.append((t0, t1)) tegrastats.stop() tegradata = tegrastats.data layer_stats = profiler.get_statistics_with_tegra(tegradata, verbose=bool(verbose)) if verbose: LOG.info(f"Profiling complete: {len(layer_stats)} layers profiled with power/energy metrics") total_times: list[float] = [] for idx in range(iterations): iteration_total = sum(layer.raw[idx] for layer in layer_stats if idx < len(layer.raw)) total_times.append(iteration_total) total_timing = LayerTiming( name="TOTAL", mean=sum(total_times) / len(total_times) if total_times else 0.0, median=sorted(total_times)[len(total_times) // 2] if total_times else 0.0, min=min(total_times) if total_times else 0.0, max=max(total_times) if total_times else 0.0, raw=total_times, ) # filter the tegrastats data by actual times during execution tegradata.filter(timeslices) powerdraw_data: dict[str, JMetric] = tegradata.powerdraw power_raw = powerdraw_data["VDD_TOTAL"].raw # compute overall energy values per inference filtered_entries = tegradata.filtered_entries or [] energy_data = [ get_powerdraw(inf_data)["VDD_TOTAL"].mean * (inf_stop - inf_start) for (inf_start, inf_stop), inf_data in filtered_entries if len(inf_data) > 0 ] # create Metric objects for overall power and energy power_draw = Metric(power_raw) energy = Metric(energy_data) if verbose: LOG.info(f"Overall power draw: {power_draw}") LOG.info(f"Overall energy: {energy}") # if loaded here, delete engine if engine_loaded: del engine return JetsonProfilerResult( layers=layer_stats, total_time=total_timing, iterations=iterations, power_draw=power_draw, energy=energy, )