Source code for trtutils.jetson._benchmark

# Copyright (c) 2024-2026 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
from __future__ import annotations

import gc
import time
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING

from jetsontools import TegraStats, filter_data, get_powerdraw, parse_tegrastats

from trtutils._benchmark import Metric
from trtutils._engine import TRTEngine
from trtutils._log import LOG
from trtutils.parallel._parallel_engines import ParallelTRTEngines

if TYPE_CHECKING:
    from collections.abc import Sequence

    from jetsontools._parsing import Metric as JMetric  # typing fix
    from typing_extensions import Self



[docs]
@dataclass
class JetsonBenchmarkResult:
    """
    Container for the metrics produced by a Jetson benchmark run.

    Attributes
    ----------
    latency : Metric
        Metric built from the per-iteration execution times.
    power_draw : Metric
        Metric built from the power draw samples taken during execution.
    energy : Metric
        Metric built from the per-inference energy estimates.

    """

    latency: Metric
    power_draw: Metric
    energy: Metric

    def __str__(self: Self) -> str:
        """Return a summary that embeds the default formatting of each metric."""
        parts = (
            f"latency={self.latency}",
            f"power_draw={self.power_draw}",
            f"energy={self.energy}",
        )
        return "JetsonBenchmarkResult(" + ", ".join(parts) + ")"

    def __repr__(self: Self) -> str:
        """Return a summary that embeds the ``repr`` of each metric."""
        parts = (
            f"latency={self.latency!r}",
            f"power_draw={self.power_draw!r}",
            f"energy={self.energy!r}",
        )
        return "JetsonBenchmarkResult(" + ", ".join(parts) + ")"




[docs]
def benchmark_engine(
    engine: TRTEngine | Path | str,
    iterations: int = 1000,
    warmup_iterations: int = 50,
    tegra_interval: int = 5,
    dla_core: int | None = None,
    *,
    warmup: bool | None = None,
    cuda_graph: bool | None = None,
    verbose: bool | None = None,
) -> JetsonBenchmarkResult:
    """
    Benchmark a TensorRT engine on a Jetson device.

    Runs ``iterations`` mock executions while tegrastats samples are being
    recorded, then derives latency, power draw and energy metrics.

    Parameters
    ----------
    engine : TRTEngine | Path | str
        The engine to benchmark. Either a TRTEngine object or path to the engine file.
        If a path is given, then a TRTEngine will be created automatically.
    iterations : int, optional
        The number of iterations to run the benchmark for, by default 1000.
    warmup_iterations : int, optional
        The number of warmup iterations to run before the benchmark, by default 50.
    tegra_interval : int, optional
        The number of milliseconds between each tegrastats sampling.
        The smaller the number, the more samples per second are generated.
        By default 5 milliseconds between samples.
    dla_core : int, optional
        The DLA core to assign DLA layers of the engine to. Default is None.
        If None, any DLA layers will be assigned to DLA core 0.
        Only used when the engine is created here from a path.
    warmup : bool, optional
        Whether to do warmup iterations, by default None.
        When the engine is created here from a path the value is forwarded
        unchanged to the TRTEngine constructor, which decides how None is treated.
        When an existing TRTEngine is passed, the warmup iterations are run
        by this function only if the value is truthy (None skips them).
    cuda_graph : bool, optional
        Whether to enable CUDA graph capture for optimized execution.
        By default None, which enables CUDA graphs.
        Set to False for engines with DLA layers, as DLA does not support CUDA graphs.
        Only used when the engine is created here from a path.
    verbose : bool, optional
        Whether or not to output additional information to stdout.
        Default None/False.

    Returns
    -------
    JetsonBenchmarkResult
        A dataclass containing the latency, power draw and energy metrics.

    Notes
    -----
    The tegrastats output is written to ``temptegra.txt`` in the current
    working directory. The file is removed again before this function
    returns or raises.

    """
    loaded_engine: bool = False
    if isinstance(engine, (Path, str)):
        engine = TRTEngine(
            engine,
            warmup_iterations=warmup_iterations,
            warmup=warmup,
            dla_core=dla_core,
            cuda_graph=cuda_graph,
            verbose=verbose,
        )
        loaded_engine = True
    elif warmup:
        # engine was built by the caller, so the warmup passes are run here
        for _ in range(warmup_iterations):
            engine.mock_execute()

    # list of metrics
    metric_names = ["latency", "power_draw", "energy"]
    raw: dict[str, list[float]] = {metric: [] for metric in metric_names}

    # pre-generate the false data
    false_data = engine.get_random_input(verbose=verbose)

    # create temp file location for data to go
    temp_file = Path.cwd() / "temptegra.txt"
    # store the start/stop times of the engine execution
    start_stop_times: list[tuple[float, float]] = []
    try:
        with TegraStats(temp_file, interval=tegra_interval):
            for _ in range(iterations):
                # wall-clock timestamps: the same values are later handed to
                # filter_data to select the tegrastats samples per inference
                t0 = time.time()
                engine.mock_execute(false_data, verbose=verbose)
                t1 = time.time()
                raw["latency"].append(t1 - t0)
                start_stop_times.append((t0, t1))

        # parse the tegra data
        tegradata = parse_tegrastats(temp_file)
    finally:
        # delete the temp file, even when the run or the parsing failed
        if temp_file.exists():
            temp_file.unlink()

    # filter the data by actual times during execution
    filtered_data, per_inference = filter_data(tegradata, start_stop_times)

    # get the power draw values
    powerdraw_data: dict[str, JMetric] = get_powerdraw(filtered_data)
    raw["power_draw"] = powerdraw_data["VDD_TOTAL"].raw

    # compute energy values
    # for energy values need to compute powerdraw per inference
    # then multiply by the duration of that inference;
    # inferences without any tegrastats sample are skipped
    energy_data = [
        get_powerdraw(inf_data)["VDD_TOTAL"].mean * (inf_stop - inf_start)
        for (inf_start, inf_stop), inf_data in per_inference
        if len(inf_data) > 0
    ]
    raw["energy"] = energy_data

    # calculate the metrics
    metrics: dict[str, Metric] = {}
    for metric_name in metric_names:
        data = raw[metric_name]
        metric = Metric(data)
        metrics[metric_name] = metric
        LOG.debug(f"{metric_name}: {metric}")

    # only release engines created by this function, never caller-owned ones
    if loaded_engine:
        del engine
        gc.collect()

    return JetsonBenchmarkResult(
        latency=metrics["latency"],
        power_draw=metrics["power_draw"],
        energy=metrics["energy"],
    )




[docs]
def benchmark_engines(
    engines: Sequence[TRTEngine | Path | str | tuple[TRTEngine | Path | str, int]],
    iterations: int = 1000,
    warmup_iterations: int = 50,
    tegra_interval: int = 5,
    *,
    warmup: bool | None = None,
    cuda_graph: bool | None = None,
    parallel: bool | None = None,
    verbose: bool | None = None,
) -> list[JetsonBenchmarkResult]:
    """
    Benchmark one or more TensorRT engines on a Jetson device.

    Parameters
    ----------
    engines : Sequence[TRTEngine | Path | str | tuple[TRTEngine | Path | str, int]]
        The engines to benchmark. Each entry is a TRTEngine, a path to an
        engine file, or a tuple of one of those and the DLA core to use.
    iterations : int, optional
        The number of iterations to run the benchmark for, by default 1000.
    warmup_iterations : int, optional
        The number of warmup iterations to run before the benchmark, by default 50.
    tegra_interval : int, optional
        The number of milliseconds between each tegrastats sampling.
        The smaller the number, the more samples per second are generated.
        By default 5 milliseconds between samples.
    warmup : bool, optional
        Whether to do warmup iterations, by default None.
        The value is forwarded unchanged to benchmark_engine (sequential)
        or to ParallelTRTEngines (parallel).
    cuda_graph : bool, optional
        Whether to enable CUDA graph capture for optimized execution.
        By default None, which enables CUDA graphs.
        Set to False for engines with DLA layers, as DLA does not support CUDA graphs.
        Only forwarded when benchmarking sequentially; the parallel path
        does not use it.
    parallel : bool, optional
        Whether or not to process the engines in parallel.
        Useful for assessing concurrent execution performance.
        Will execute the engines in lockstep.
        If None, will benchmark each engine individually.
    verbose : bool, optional
        Whether or not to output additional information to stdout.
        Default None/False.
        Only forwarded when benchmarking sequentially; the parallel path
        does not use it.

    Returns
    -------
    list[JetsonBenchmarkResult]
        A list of dataclasses containing the results of the benchmark.
        Without parallel there is one result per engine, in input order.
        If parallel was True, will only contain one item.

    Notes
    -----
    In parallel mode the tegrastats output is written to ``temptegra.txt``
    in the current working directory. The file is removed again before this
    function returns or raises.

    """
    # split the input into engines and their (optional) DLA core assignment
    temp_engines: list[Path | TRTEngine] = []
    dla_assignments: list[int | None] = []
    for engine_info in engines:
        dla_core: int | None = None
        if isinstance(engine_info, tuple):
            engine, dla_core = engine_info  # type: ignore[assignment]
        else:
            engine = engine_info
        if isinstance(engine, str):
            engine = Path(engine)
        temp_engines.append(engine)  # type: ignore[arg-type]
        dla_assignments.append(dla_core)

    if not parallel:
        return [
            benchmark_engine(
                engine,
                iterations,
                warmup_iterations,
                tegra_interval,
                dla_core=dla_core,
                warmup=warmup,
                cuda_graph=cuda_graph,
                verbose=verbose,
            )
            for engine, dla_core in zip(temp_engines, dla_assignments)
        ]

    # otherwise we need a parallel setup
    # entries without a DLA core are passed bare, the others as (engine, core)
    trt_engines = ParallelTRTEngines(
        [(ep, dc) if dc is not None else ep for ep, dc in zip(temp_engines, dla_assignments)],
        warmup_iterations=warmup_iterations,
        warmup=warmup,
    )

    # list of metrics
    metric_names = ["latency", "power_draw", "energy"]
    raw: dict[str, list[float]] = {metric: [] for metric in metric_names}

    # pre-generate the false data
    false_data = trt_engines.get_random_input()

    # create temp file location for data to go
    temp_file = Path.cwd() / "temptegra.txt"
    # store the start/stop times of the engine execution
    start_stop_times: list[tuple[float, float]] = []
    try:
        with TegraStats(temp_file, interval=tegra_interval):
            for _ in range(iterations):
                # wall-clock timestamps: the same values are later handed to
                # filter_data to select the tegrastats samples per iteration
                t0 = time.time()
                trt_engines.submit(false_data)
                trt_engines.retrieve()
                t1 = time.time()
                raw["latency"].append(t1 - t0)
                start_stop_times.append((t0, t1))

        # parse the tegra data
        tegradata = parse_tegrastats(temp_file)
    finally:
        # delete the temp file, even when the run or the parsing failed
        if temp_file.exists():
            temp_file.unlink()

    # filter the data by actual times during execution
    filtered_data, per_inference = filter_data(tegradata, start_stop_times)

    # get the power draw values
    powerdraw_data: dict[str, JMetric] = get_powerdraw(filtered_data)
    raw["power_draw"] = powerdraw_data["VDD_TOTAL"].raw

    # compute energy values
    # for energy values need to compute powerdraw per inference
    # then multiply by the duration of that inference;
    # inferences without any tegrastats sample are skipped
    energy_data = [
        get_powerdraw(inf_data)["VDD_TOTAL"].mean * (inf_stop - inf_start)
        for (inf_start, inf_stop), inf_data in per_inference
        if len(inf_data) > 0
    ]
    raw["energy"] = energy_data

    # calculate the metrics
    metrics: dict[str, Metric] = {}
    for metric_name in metric_names:
        data = raw[metric_name]
        metric = Metric(data)
        metrics[metric_name] = metric
        LOG.debug(f"{metric_name}: {metric}")

    # no need to check if we loaded the engines, since passed engines are not deleted
    # explicitly by the ParallelTRTEngines class
    del trt_engines
    gc.collect()

    return [
        JetsonBenchmarkResult(
            latency=metrics["latency"],
            power_draw=metrics["power_draw"],
            energy=metrics["energy"],
        ),
    ]