Source code for trtutils.jetson._benchmark

# Copyright (c) 2024-2026 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
from __future__ import annotations

import gc
import time
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING

from jetsontools import TegraStats, filter_data, get_powerdraw, parse_tegrastats

from trtutils._benchmark import Metric
from trtutils._engine import TRTEngine
from trtutils._log import LOG
from trtutils.parallel._parallel_engines import ParallelTRTEngines

if TYPE_CHECKING:
    from collections.abc import Sequence

    from jetsontools._parsing import Metric as JMetric  # typing fix
    from typing_extensions import Self


@dataclass
class JetsonBenchmarkResult:
    """
    Container for the metrics gathered by a Jetson benchmark run.

    Attributes
    ----------
    latency : Metric
        Summary of the per-iteration execution latency.
    power_draw : Metric
        Summary of the power draw sampled while the engine was executing.
    energy : Metric
        Summary of the per-inference energy.

    """

    latency: Metric
    power_draw: Metric
    energy: Metric

    def __str__(self: Self) -> str:
        """Return a readable summary built from ``str()`` of every metric."""
        text = f"JetsonBenchmarkResult(latency={self.latency}, power_draw={self.power_draw}, energy={self.energy})"
        return text

    def __repr__(self: Self) -> str:
        """Return a debugging summary built from ``repr()`` of every metric."""
        text = f"JetsonBenchmarkResult(latency={self.latency!r}, power_draw={self.power_draw!r}, energy={self.energy!r})"
        return text
def benchmark_engine(
    engine: TRTEngine | Path | str,
    iterations: int = 1000,
    warmup_iterations: int = 50,
    tegra_interval: int = 5,
    dla_core: int | None = None,
    *,
    warmup: bool | None = None,
    cuda_graph: bool | None = None,
    verbose: bool | None = None,
) -> JetsonBenchmarkResult:
    """
    Benchmark a TensorRT engine on a Jetson device.

    Runs the engine on random input while tegrastats samples are recorded
    to a temporary file, then derives latency, power draw and energy
    metrics from the recorded data.

    Parameters
    ----------
    engine : TRTEngine | Path | str
        The engine to benchmark.
        Either a TRTEngine object or path to the engine file.
        If a path is given, then a TRTEngine will be created automatically.
    iterations : int, optional
        The number of iterations to run the benchmark for, by default 1000.
    warmup_iterations : int, optional
        The number of warmup iterations to run before the benchmark, by default 50.
    tegra_interval : int, optional
        The number of milliseconds between each tegrastats sampling.
        The smaller the number, the more samples per second are generated.
        By default 5 milliseconds between samples.
    dla_core : int, optional
        The DLA core to assign DLA layers of the engine to. Default is None.
        If None, any DLA layers will be assigned to DLA core 0.
        Only used when the engine is created from a path.
    warmup : bool, optional
        Whether to do warmup iterations, by default None
        If None, warmup will be set to True.
    cuda_graph : bool, optional
        Whether to enable CUDA graph capture for optimized execution.
        By default None, which enables CUDA graphs.
        Set to False for engines with DLA layers, as DLA does not support CUDA graphs.
        Only used when the engine is created from a path.
    verbose : bool, optional
        Whether or not to output additional information to stdout.
        Default None/False.

    Returns
    -------
    JetsonBenchmarkResult
        A dataclass containing the results of the benchmark.

    """
    loaded_engine: bool = False
    if isinstance(engine, (Path, str)):
        # NOTE(review): warmup=None is forwarded unchanged; confirm that
        # TRTEngine also treats None as "perform warmup".
        engine = TRTEngine(
            engine,
            warmup_iterations=warmup_iterations,
            warmup=warmup,
            dla_core=dla_core,
            cuda_graph=cuda_graph,
            verbose=verbose,
        )
        loaded_engine = True
    elif warmup is None or warmup:
        # documented contract: an unspecified warmup (None) means True,
        # so only an explicit False skips the warmup of a passed engine
        for _ in range(warmup_iterations):
            engine.mock_execute()

    # list of metrics
    metric_names = ["latency", "power_draw", "energy"]
    raw: dict[str, list[float]] = {metric: [] for metric in metric_names}

    # pre-generate the false data
    false_data = engine.get_random_input(verbose=verbose)

    # create temp file location for data to go
    temp_file = Path.cwd() / "temptegra.txt"

    # store the start/stop times of the engine execution
    # these timestamps are handed to filter_data to select the tegrastats samples
    start_stop_times: list[tuple[float, float]] = []

    try:
        with TegraStats(temp_file, interval=tegra_interval):
            for _ in range(iterations):
                t0 = time.time()
                engine.mock_execute(false_data, verbose=verbose)
                t1 = time.time()

                raw["latency"].append(t1 - t0)
                start_stop_times.append((t0, t1))

        # parse the tegra data
        tegradata = parse_tegrastats(temp_file)
    finally:
        # delete the temp file, even when the run or the parsing failed
        if temp_file.exists():
            temp_file.unlink()

    # filter the data by actual times during execution
    filtered_data, per_inference = filter_data(tegradata, start_stop_times)

    # get the energy values
    powerdraw_data: dict[str, JMetric] = get_powerdraw(filtered_data)
    raw["power_draw"] = powerdraw_data["VDD_TOTAL"].raw

    # compute energy values
    # for each inference: mean power draw over its samples times its duration
    # inferences without any tegrastats sample are skipped
    energy_data = [
        get_powerdraw(inf_data)["VDD_TOTAL"].mean * (inf_stop - inf_start)
        for (inf_start, inf_stop), inf_data in per_inference
        if len(inf_data) > 0
    ]
    raw["energy"] = energy_data

    # calculate the metrics
    metrics: dict[str, Metric] = {}
    for metric_name in metric_names:
        data = raw[metric_name]
        metric = Metric(data)
        metrics[metric_name] = metric
        LOG.debug(f"{metric_name}: {metric}")

    # only release engines created here; a passed engine belongs to the caller
    if loaded_engine:
        del engine
        gc.collect()

    return JetsonBenchmarkResult(
        latency=metrics["latency"],
        power_draw=metrics["power_draw"],
        energy=metrics["energy"],
    )
def benchmark_engines(
    engines: Sequence[TRTEngine | Path | str | tuple[TRTEngine | Path | str, int]],
    iterations: int = 1000,
    warmup_iterations: int = 50,
    tegra_interval: int = 5,
    *,
    warmup: bool | None = None,
    cuda_graph: bool | None = None,
    parallel: bool | None = None,
    verbose: bool | None = None,
) -> list[JetsonBenchmarkResult]:
    """
    Benchmark multiple TensorRT engines on a Jetson device.

    Parameters
    ----------
    engines : Sequence[TRTEngine | Path | str | tuple[TRTEngine | Path | str, int]]
        The engines to benchmark.
        Each entry is a TRTEngine object or a path to an engine file,
        optionally wrapped in a tuple together with the DLA core to use.
    iterations : int, optional
        The number of iterations to run the benchmark for, by default 1000.
    warmup_iterations : int, optional
        The number of warmup iterations to run before the benchmark, by default 50.
    tegra_interval : int, optional
        The number of milliseconds between each tegrastats sampling.
        The smaller the number, the more samples per second are generated.
        By default 5 milliseconds between samples.
    warmup : bool, optional
        Whether to do warmup iterations, by default None
        If None, warmup will be set to True.
    cuda_graph : bool, optional
        Whether to enable CUDA graph capture for optimized execution.
        By default None, which enables CUDA graphs.
        Set to False for engines with DLA layers, as DLA does not support CUDA graphs.
        Only forwarded when the engines are benchmarked individually.
    parallel : bool, optional
        Whether or not to process the engines in parallel.
        Useful for assessing concurrent execution performance.
        Will execute the engines in lockstep.
        If None, will benchmark each engine individually.
    verbose : bool, optional
        Whether or not to output additional information to stdout.
        Default None/False.
        Only forwarded when the engines are benchmarked individually.

    Returns
    -------
    list[JetsonBenchmarkResult]
        A list of dataclasses containing the results of the benchmark.
        If parallel was True, will only contain one item.

    """
    # split every entry into the engine (str paths become Path) and its DLA core
    temp_engines: list[Path | TRTEngine] = []
    dla_assignments: list[int | None] = []
    for engine_info in engines:
        dla_core: int | None = None
        if isinstance(engine_info, tuple):
            engine, dla_core = engine_info  # type: ignore[assignment]
        else:
            engine = engine_info
        if isinstance(engine, str):
            engine = Path(engine)
        temp_engines.append(engine)  # type: ignore[arg-type]
        dla_assignments.append(dla_core)

    if not parallel:
        return [
            benchmark_engine(
                engine,
                iterations,
                warmup_iterations,
                tegra_interval,
                dla_core=dla_core,
                warmup=warmup,
                cuda_graph=cuda_graph,
                verbose=verbose,
            )
            for engine, dla_core in zip(temp_engines, dla_assignments)
        ]

    # otherwise we need a parallel setup
    # entries without a DLA assignment are passed as the bare engine/path
    trt_engines = ParallelTRTEngines(
        [(ep, dc) if dc is not None else ep for ep, dc in zip(temp_engines, dla_assignments)],
        warmup_iterations=warmup_iterations,
        warmup=warmup,
    )

    # list of metrics
    metric_names = ["latency", "power_draw", "energy"]
    raw: dict[str, list[float]] = {metric: [] for metric in metric_names}

    # pre-generate the false data
    false_data = trt_engines.get_random_input()

    # create temp file location for data to go
    temp_file = Path.cwd() / "temptegra.txt"

    # store the start/stop times of the engine execution
    # these timestamps are handed to filter_data to select the tegrastats samples
    start_stop_times: list[tuple[float, float]] = []

    try:
        with TegraStats(temp_file, interval=tegra_interval):
            for _ in range(iterations):
                # one iteration covers submitting to and retrieving from all engines
                t0 = time.time()
                trt_engines.submit(false_data)
                trt_engines.retrieve()
                t1 = time.time()

                raw["latency"].append(t1 - t0)
                start_stop_times.append((t0, t1))

        # parse the tegra data
        tegradata = parse_tegrastats(temp_file)
    finally:
        # delete the temp file, even when the run or the parsing failed
        if temp_file.exists():
            temp_file.unlink()

    # filter the data by actual times during execution
    filtered_data, per_inference = filter_data(tegradata, start_stop_times)

    # get the energy values
    powerdraw_data: dict[str, JMetric] = get_powerdraw(filtered_data)
    raw["power_draw"] = powerdraw_data["VDD_TOTAL"].raw

    # compute energy values
    # for each inference: mean power draw over its samples times its duration
    # inferences without any tegrastats sample are skipped
    energy_data = [
        get_powerdraw(inf_data)["VDD_TOTAL"].mean * (inf_stop - inf_start)
        for (inf_start, inf_stop), inf_data in per_inference
        if len(inf_data) > 0
    ]
    raw["energy"] = energy_data

    # calculate the metrics
    metrics: dict[str, Metric] = {}
    for metric_name in metric_names:
        data = raw[metric_name]
        metric = Metric(data)
        metrics[metric_name] = metric
        LOG.debug(f"{metric_name}: {metric}")

    # no need to check if we loaded the engines, since passed engines are not deleted
    # explicitly by the ParallelTRTEngines class
    del trt_engines
    gc.collect()

    return [
        JetsonBenchmarkResult(
            latency=metrics["latency"],
            power_draw=metrics["power_draw"],
            energy=metrics["energy"],
        ),
    ]