Source code for trtutils.core._kernels

# Copyright (c) 2024 Justin Davis (davisjustin302@gmail.com)
#
# MIT License
# ruff: noqa: PYI041
# mypy: disable-error-code="import-untyped"
from __future__ import annotations

import contextlib
from collections import deque
from pathlib import Path
from typing import TYPE_CHECKING

import numpy as np
import nvtx

from trtutils._flags import FLAGS
from trtutils._log import LOG
from trtutils.compat._libs import cuda

from ._cuda import cuda_call
from ._nvrtc import compile_and_load_kernel

if TYPE_CHECKING:
    from typing_extensions import Self

    from trtutils.compat._libs import cudart


class Kernel:
    """Holds kernel code and PTX for execution."""

    def __init__(
        self: Self,
        kernel_file: Path | str,
        name: str,
        max_arg_cache: int = 1,
        *,
        verbose: bool | None = None,
    ) -> None:
        """
        Create the specified kernel from the CUDA code.

        Parameters
        ----------
        kernel_file : Path | str
            The CUDA file containing the kernel definition.
        name : str
            The name of the kernel to compile.
        max_arg_cache : int
            The number of arg arrays to store cached to prevent
            garbage collection. Used as the ``maxlen`` of the internal deque.
            Since args are created per-call only 1 is typically needed.
            Default 1.
        verbose : bool, optional
            Whether or not to output additional information.
            Forwarded unchanged to ``compile_and_load_kernel``.

        """
        # Read the flag once so the push and the pop are always paired,
        # even if the flag is toggled while the constructor is running.
        nvtx_enabled = FLAGS.NVTX_ENABLED
        if nvtx_enabled:
            nvtx.push_range(f"kernel::init [{name}]")

        try:
            # Explicit encoding: do not depend on the platform locale
            # when decoding the CUDA source.
            kernel_code: str = Path(kernel_file).read_text(encoding="utf-8")

            self._name = name
            self._nvtx_tag_call = f"kernel::call [{name}]"
            self._module, self._kernel = compile_and_load_kernel(
                kernel_code,
                name,
                verbose=verbose,
            )

            # Keeps the most recent per-call argument buffers referenced.
            self._inter_args: deque[list[np.ndarray]] = deque(maxlen=max_arg_cache)
            self._freed = False
        finally:
            # Close the range on failure too (missing file, compile error),
            # otherwise the NVTX range stack is left unbalanced.
            if nvtx_enabled:
                nvtx.pop_range()

    def free(self: Self) -> None:
        """
        Free the memory of the loaded kernel.

        Safe to call more than once; calls after the first are no-ops.
        """
        if self._freed or self._module is None:
            return
        # CUDA context may already be destroyed during interpreter shutdown,
        # so unloading is deliberately best-effort.
        with contextlib.suppress(Exception):
            cuda_call(cuda.cuModuleUnload(self._module))
        self._module = None
        self._freed = True

    def __del__(self: Self) -> None:
        # AttributeError covers instances whose __init__ failed before
        # the module attributes were assigned.
        with contextlib.suppress(AttributeError, RuntimeError, SystemError):
            self.free()

    def create_args(
        self: Self,
        *args: int | float | np.ndarray,
        verbose: bool | None = False,
    ) -> np.ndarray:
        """
        Create the argument pointer array for a CUDA kernel call.

        Is a wrapper around :func:`trtutils.core.create_kernel_args`, which
        stores the intermediate argument arrays inside of the class.
        Without a live reference the intermediate arrays can be cleaned up
        by the garbage collector before the kernel has accessed the memory.

        Parameters
        ----------
        *args : int | float | np.ndarray
            All args to pass to the kernel as integers, floats, or
            pre-formed args. If arrays are to be passed to the kernel,
            they should be given as an integer representing the pointer
            returned from CUDA malloc. A preformed arg is one which is
            already wrapped as an np.ndarray with specific type.
        verbose : bool, optional
            Whether or not to output additional information
            about the passed args.

        Returns
        -------
        np.ndarray
            The np.ndarray of argument pointers (one pointer per arg)

        """
        ptrs, intermediate = create_kernel_args(*args, verbose=verbose)
        # Hold a reference so the buffers the pointers refer to stay alive.
        self._inter_args.append(intermediate)
        return ptrs

    def __call__(
        self: Self,
        num_blocks: tuple[int, int, int],
        num_threads: tuple[int, int, int],
        stream: cudart.cudaStream_t,
        args: np.ndarray,
        *,
        verbose: bool | None = None,
    ) -> None:
        """Launch the kernel; shorthand for :meth:`call`."""
        self.call(num_blocks, num_threads, stream, args, verbose=verbose)

    def call(
        self: Self,
        num_blocks: tuple[int, int, int],
        num_threads: tuple[int, int, int],
        stream: cudart.cudaStream_t,
        args: np.ndarray,
        *,
        verbose: bool | None = None,
    ) -> None:
        """
        Launch the kernel with the specified blocks, threads, and args in a stream.

        Parameters
        ----------
        num_blocks: tuple[int, int, int]
            The number of blocks to use for the kernel calls.
        num_threads: tuple[int, int, int]
            The number of threads to use for the kernel calls.
        stream: cudart.cudaStream_t
            The CUDA stream to execute the kernel in.
        args: np.ndarray
            The NumPy array containing the pointers to the arguments.
            This array should be 1D containing int64 pointers to
            a NumPy array containing each individual argument.
        verbose : bool, optional
            When truthy, the launch configuration is logged at debug level.

        """
        nvtx_enabled = FLAGS.NVTX_ENABLED
        if nvtx_enabled:
            nvtx.push_range(self._nvtx_tag_call)

        try:
            if verbose:
                LOG.debug(
                    f"Calling kernel: {self._name}, blocks: {num_blocks}, threads: {num_threads}, args: {args}",
                )

            launch_kernel(
                self._kernel,
                num_blocks,
                num_threads,
                stream,
                args,
            )
        finally:
            # Keep the NVTX range stack balanced when the launch raises.
            if nvtx_enabled:
                nvtx.pop_range()
def launch_kernel(
    kernel: cuda.CUkernel,
    num_blocks: tuple[int, int, int],
    num_threads: tuple[int, int, int],
    stream: cudart.cudaStream_t,
    args: np.ndarray,
) -> None:
    """
    Launch a CUDA kernel with specified blocks, threads, and args in a stream.

    Parameters
    ----------
    kernel: cuda.CUkernel
        The CUDA kernel handle to launch, as compiled by NVRTC.
    num_blocks: tuple[int, int, int]
        The number of blocks to use for the kernel call.
    num_threads: tuple[int, int, int]
        The number of threads to use for the kernel call.
    stream: cudart.cudaStream_t
        The CUDA stream to execute the kernel in.
    args: np.ndarray
        The NumPy array containing the pointers to the arguments.
        This array should be 1D containing int64 pointers to
        a NumPy array containing each individual argument.

    """
    # Read the flag once so the push and the pop are always paired.
    nvtx_enabled = FLAGS.NVTX_ENABLED
    if nvtx_enabled:
        nvtx.push_range("core::launch_kernel")

    try:
        cuda_call(
            cuda.cuLaunchKernel(
                kernel,
                *num_blocks,
                *num_threads,
                0,  # sharedMemBytes in the cuLaunchKernel signature
                stream,
                args.ctypes.data,  # address of the argument-pointer array
                0,  # "extra" launch options in the cuLaunchKernel signature
            ),
        )
    finally:
        # A failed launch must not leave the NVTX range open.
        if nvtx_enabled:
            nvtx.pop_range()
def create_kernel_args(
    *args: int | float | np.ndarray,
    verbose: bool | None = False,
) -> tuple[np.ndarray, list[np.ndarray]]:
    """
    Create the argument pointer array for a CUDA kernel call.

    Adapted from the workflow present in:
    https://nvidia.github.io/cuda-python/overview.html#cuda-python-workflow

    This MUST be called for each kernel call. If the args are not regenerated
    the CUDA runtime will crash. The intermediate argument buffers MUST be
    saved as a variable to ensure the garbage collector does not delete them
    before use. The Kernel wrapper class handles this and is the recommended
    way to interact with kernels inside of trtutils.

    Parameters
    ----------
    *args : int | float | np.ndarray
        All args to pass to the kernel as integers, floats, or
        pre-formed args. If arrays are to be passed to the kernel,
        they should be given as an integer representing the pointer
        returned from CUDA malloc. A preformed arg is one which is
        already wrapped as an np.ndarray with specific type.
        Non-negative integers are stored as uint64, negative integers
        as int64, and floats as float32.
    verbose : bool, optional
        Whether or not to output additional information
        about the passed args.

    Returns
    -------
    tuple[np.ndarray, list[np.ndarray]]
        The np.ndarray of argument pointers (one pointer per arg),
        and the allocated arrays

    Raises
    ------
    TypeError
        If the type of an argument is not integer, float, or np.ndarray
    OverflowError
        If an integer argument does not fit in 64 bits

    """
    if verbose:
        LOG.debug(f"Converting args: {args}")

    # convert all args to np.ndarrays
    converted_args: list[np.ndarray] = []
    for arg in args:
        if isinstance(arg, int):
            # uint64 cannot hold a negative Python int (NumPy 2 raises
            # OverflowError). int64 gives the same 8-byte two's-complement
            # bit pattern the kernel would read.
            int_dtype = np.int64 if arg < 0 else np.uint64
            converted_args.append(np.array([arg], dtype=int_dtype))
        elif isinstance(arg, float):
            converted_args.append(np.array([arg], dtype=np.float32))
        elif isinstance(arg, np.ndarray):
            # pre-formed args are passed through untouched
            converted_args.append(arg)
        else:
            err_msg = f"Unrecognized arg type for CUDA kernel: {type(arg)}"
            raise TypeError(err_msg)

        if verbose:
            last_arg = converted_args[-1]
            LOG.debug(f"Converted Arg: {arg} -> Array: {last_arg} {last_arg.dtype}")

    # get a pointer to each np.ndarray and pack into new array
    ptrs: np.ndarray = np.array(
        [arg.ctypes.data for arg in converted_args],
        dtype=np.uint64,
    )

    if verbose:
        LOG.debug(f"Generated pointers: {ptrs}")

    return ptrs, converted_args