Source code for grunnur._static

from __future__ import annotations

from typing import TYPE_CHECKING, Any

from ._api import cuda_api_id
from ._context import Context
from ._device import Device
from ._program import (
    PreparedKernel,
    SingleDeviceProgram,
    _check_set_constant_array,
    _set_constant_array,
    normalize_sizes,
)
from ._utils import prod, update_dict
from ._vsize import VirtualSizeError, VirtualSizes

# Name of the template global under which a static kernel's virtual size modules
# are exposed. The name is reserved: `StaticKernel` reports an error if the
# user-supplied render globals already contain it.
_STATIC_MODULES_GLOBAL = "static"

if TYPE_CHECKING:  # pragma: no cover
    from collections.abc import Callable, Iterable, Mapping, Sequence

    import numpy
    from numpy.typing import NDArray

    from ._array import Array, MultiArray
    from ._array_metadata import AsArrayMetadata
    from ._buffer import Buffer
    from ._context import BoundDevice, BoundMultiDevice
    from ._modules import Snippet
    from ._queue import MultiQueue, Queue
    from ._template import DefTemplate


class StaticKernel:
    """
    An object containing a GPU kernel with fixed call sizes.

    The globals for the source template will contain an object with the name ``static``
    of the type :py:class:`~grunnur._vsize.VsizeModules`
    containing the id/size functions to be used instead of regular ones.
    """

    devices: BoundMultiDevice
    """Devices on which this kernel was compiled."""

    # NOTE(review): `queue` is declared and documented, but nothing in this class assigns it,
    # so reading it on an instance raises `AttributeError` unless it is set elsewhere.
    queue: Queue
    """The queue this static kernel was compiled and prepared for."""

    sources: dict[BoundDevice, str]
    """Source files used for each device."""

    def __init__(
        self,
        devices: Sequence[BoundDevice],
        template_src: str | Callable[..., str] | DefTemplate | Snippet,
        name: str,
        global_size: Sequence[int] | Mapping[BoundDevice, Sequence[int]],
        *,
        local_size: Sequence[int] | None | Mapping[BoundDevice, Sequence[int] | None] = None,
        render_args: Sequence[Any] = (),
        render_globals: Mapping[str, Any] = {},
        constant_arrays: Mapping[str, AsArrayMetadata] = {},
        keep: bool = False,
        fast_math: bool = False,
        compiler_options: Iterable[str] = [],
    ):
        """
        :param devices: a single- or a multi-device object on which to compile this program.
        :param template_src: a string with the source code, or a Mako template source to render.
        :param name: the kernel's name.
        :param global_size: see :py:meth:`~grunnur._program.Kernel.prepare`.
        :param local_size: see :py:meth:`~grunnur._program.Kernel.prepare`.
        :param render_args: positional arguments for the template;
            passed through unchanged to the program compilation.
        :param render_globals: a dictionary of globals to pass to the template.
            Must not contain the name ``static``, which is reserved.
        :param constant_arrays: (**CUDA only**) a dictionary ``name: (size, dtype)``
            of global constant arrays to be declared in the program.
        :param keep: passed through unchanged to the program compilation.
        :param fast_math: passed through unchanged to the program compilation.
        :param compiler_options: passed through to the program compilation.
            Any iterable is accepted; it is consumed exactly once.
        :raises VirtualSizeError: if no launch parameters can be found
            for the requested sizes, or if the compiled kernel cannot be executed
            with any local size.
        """
        multi_device, n_global_size, n_local_size = normalize_sizes(
            devices, global_size, local_size
        )

        self.devices = multi_device

        # The kernel may be compiled several times (once per device, and again on every
        # local size reduction), so a one-shot iterable must be materialized up front;
        # otherwise every compilation after the first would see no options at all.
        compiler_options = list(compiler_options)

        kernel_adapters = {}
        sources = {}
        vs_metadata = {}
        for device in multi_device:
            kernel_adapter, source, vs = self._compile_for_device(
                device,
                template_src,
                name,
                n_global_size[device],
                n_local_size[device],
                render_args=render_args,
                render_globals=render_globals,
                constant_arrays=constant_arrays,
                keep=keep,
                fast_math=fast_math,
                compiler_options=compiler_options,
            )
            kernel_adapters[device] = kernel_adapter
            sources[device] = source
            vs_metadata[device] = vs

        self.sources = sources
        self._vs_metadata = vs_metadata
        self._sd_kernel_adapters = kernel_adapters

        # The prepared kernel is launched with the real (not the virtual) sizes.
        global_sizes = {device: vs.real_global_size for device, vs in vs_metadata.items()}
        local_sizes = {device: vs.real_local_size for device, vs in vs_metadata.items()}

        self._prepared_kernel = PreparedKernel(
            multi_device, kernel_adapters, global_sizes, local_sizes
        )

    @staticmethod
    def _compile_for_device(
        device: BoundDevice,
        template_src: str | Callable[..., str] | DefTemplate | Snippet,
        name: str,
        global_size: Sequence[int],
        local_size: Sequence[int] | None,
        *,
        render_args: Sequence[Any],
        render_globals: Mapping[str, Any],
        constant_arrays: Mapping[str, AsArrayMetadata],
        keep: bool,
        fast_math: bool,
        compiler_options: Iterable[str],
    ) -> tuple[Any, str, VirtualSizes]:
        """
        Compile the kernel for one device, shrinking the local size until it can be launched.

        Returns the kernel adapter, the rendered source
        and the virtual size metadata of the last (successful) compilation.
        """
        device_params = device.params

        # Since virtual size functions require some registers,
        # they affect the maximum local size.
        # Start from the device's max local size as the first approximation
        # and recompile kernels with smaller local sizes until convergence.
        max_total_local_size = device_params.max_total_local_size

        while True:
            # Try to find kernel launch parameters for the requested local size.
            # May raise VirtualSizeError if it's not possible,
            # just let it pass to the caller.
            vs = VirtualSizes(
                max_total_local_size=max_total_local_size,
                max_local_sizes=device_params.max_local_sizes,
                max_num_groups=device_params.max_num_groups,
                local_size_multiple=device_params.warp_size,
                virtual_global_size=global_size,
                virtual_local_size=local_size,
            )

            new_render_globals = update_dict(
                render_globals,
                {_STATIC_MODULES_GLOBAL: vs.vsize_modules},
                error_msg=(
                    f"The global name '{_STATIC_MODULES_GLOBAL}' is reserved in static kernels"
                ),
            )

            # Try to compile the kernel with the corresponding virtual size functions
            program = SingleDeviceProgram(
                device,
                template_src,
                render_args=render_args,
                render_globals=new_render_globals,
                constant_arrays=constant_arrays,
                keep=keep,
                fast_math=fast_math,
                compiler_options=compiler_options,
            )
            kernel_adapter = program.get_kernel_adapter(name)

            if kernel_adapter.max_total_local_size >= prod(vs.real_local_size):
                # Kernel will execute with this local size, use it
                return kernel_adapter, program.source, vs

            # By the contract of VirtualSizes, prod(vs.real_local_size) <= max_total_local_size
            # Also, since we're still in this loop,
            # kernel_adapter.max_total_local_size < prod(vs.real_local_size).
            # Therefore the new max_total_local_size value is guaranteed
            # to be smaller than the previous one.
            max_total_local_size = kernel_adapter.max_total_local_size

            # In most cases the iteration should stop at `max_total_local_size == 1`,
            # where the virtual size is trivial and always possible.
            # But occasionally we may get a kernel that cannot be executed at all
            # (e.g. it requests too much local memory),
            # and some platforms may return 0, which `VirtualSizes` will not like.
            # So we'll have a sanity check here.
            if max_total_local_size == 0:
                raise VirtualSizeError(
                    "The kernel requires too much resources to be executed with any local size"
                )

    def __call__(
        self,
        queue: Queue | MultiQueue,
        *args: Mapping[BoundDevice, Array | Buffer | numpy.generic]
        | MultiArray
        | Array
        | Buffer
        | numpy.generic,
    ) -> Any:
        """
        Execute the kernel.
        In case of the OpenCL backend, returns a ``pyopencl.Event`` object.

        :param queue: the single- or multi-device queue to use.
        :param args: kernel arguments. See :py:meth:`grunnur._program.PreparedKernel.__call__`.
        :returns: whatever the underlying prepared kernel returns for this call.
        """
        return self._prepared_kernel(queue, *args)

    def set_constant_array(
        self, queue: Queue, name: str, arr: Array | Buffer | NDArray[Any]
    ) -> None:
        """
        Uploads a constant array to the context's devices (**CUDA only**).

        :param queue: the queue to use for the transfer.
        :param name: the name of the constant array symbol in the code.
        :param arr: either a device or a host array.
        """
        _check_set_constant_array(queue, self.devices)
        # Constant arrays live in the compiled program, so pick the one built for this device.
        kernel_adapter = self._sd_kernel_adapters[queue.device]
        _set_constant_array(queue, kernel_adapter.program_adapter, name, arr)