Source code for grunnur.static

from __future__ import annotations

from typing import Any, Callable, Optional, Union, Dict, Mapping, Tuple, Sequence

import numpy

from .array import Array
from .array_metadata import ArrayMetadataLike
from .device import Device
from .api import cuda_api_id
from .template import DefTemplate
from .modules import Snippet
from .context import Context, BoundDevice, BoundMultiDevice
from .queue import Queue
from .utils import prod, update_dict
from .vsize import VirtualSizes, VirtualSizeError
from .program import (
    SingleDeviceProgram,
    PreparedKernel,
    normalize_sizes,
    _check_set_constant_array,
    _set_constant_array,
)


# the name of the global in the template containing static kernel modules
_STATIC_MODULES_GLOBAL = "static"
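
# For illustration only: a hedged sketch of how this global is typically referenced from a
# Mako kernel source template. The attribute names used below (``global_id``, ``skip``) come
# from :py:class:`~grunnur.vsize.VsizeModules` and are assumptions here, not something
# defined in this module.
#
#     KERNEL void multiply(GLOBAL_MEM int *dest, GLOBAL_MEM const int *a, GLOBAL_MEM const int *b)
#     {
#         if (${static.skip}()) return;
#         const int i = ${static.global_id}(0);
#         dest[i] = a[i] * b[i];
#     }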


class StaticKernel:
    """
    An object containing a GPU kernel with fixed call sizes.

    The globals for the source template will contain an object with the name ``static``
    of the type :py:class:`~grunnur.vsize.VsizeModules` containing the id/size functions
    to be used instead of the regular ones.
    """

    devices: BoundMultiDevice
    """Devices on which this kernel was compiled."""

    queue: Queue
    """The queue this static kernel was compiled and prepared for."""

    sources: Dict[BoundDevice, str]
    """Source files used for each device."""

    def __init__(
        self,
        devices: Sequence[BoundDevice],
        template_src: Union[str, Callable[..., str], DefTemplate, Snippet],
        name: str,
        global_size: Union[Sequence[int], Mapping[BoundDevice, Sequence[int]]],
        local_size: Union[
            Sequence[int], None, Mapping[BoundDevice, Optional[Sequence[int]]]
        ] = None,
        render_args: Sequence[Any] = (),
        render_globals: Mapping[str, Any] = {},
        constant_arrays: Optional[Mapping[str, ArrayMetadataLike]] = None,
        keep: bool = False,
        fast_math: bool = False,
        compiler_options: Optional[Sequence[str]] = None,
    ):
        """
        :param devices: a single- or a multi-device object on which to compile this program.
        :param template_src: a string with the source code, or a Mako template source to render.
        :param name: the kernel's name.
        :param global_size: see :py:meth:`~grunnur.program.Kernel.prepare`.
        :param local_size: see :py:meth:`~grunnur.program.Kernel.prepare`.
        :param render_globals: a dictionary of globals to pass to the template.
        :param constant_arrays: (**CUDA only**) a dictionary ``name: (size, dtype)``
            of global constant arrays to be declared in the program.
        """
        multi_device, n_global_size, n_local_size = normalize_sizes(
            devices, global_size, local_size
        )

        self.devices = multi_device

        kernel_adapters = {}
        sources = {}
        vs_metadata = {}

        for device in multi_device:
            device_params = device.params

            kernel_ls = n_local_size[device]
            kernel_gs = n_global_size[device]

            # Since virtual size functions require some registers,
            # they affect the maximum local size.
            # Start from the device's max local size as the first approximation
            # and recompile kernels with smaller local sizes until convergence.
            max_total_local_size = device_params.max_total_local_size

            while True:
                # Try to find kernel launch parameters for the requested local size.
                # May raise VirtualSizeError if it's not possible;
                # just let it pass to the caller.
                vs = VirtualSizes(
                    max_total_local_size=max_total_local_size,
                    max_local_sizes=device_params.max_local_sizes,
                    max_num_groups=device_params.max_num_groups,
                    local_size_multiple=device_params.warp_size,
                    virtual_global_size=kernel_gs,
                    virtual_local_size=kernel_ls,
                )

                new_render_globals = update_dict(
                    render_globals,
                    {_STATIC_MODULES_GLOBAL: vs.vsize_modules},
                    error_msg=f"The global name '{_STATIC_MODULES_GLOBAL}' is reserved in static kernels",
                )

                # Try to compile the kernel with the corresponding virtual size functions
                program = SingleDeviceProgram(
                    device,
                    template_src,
                    render_args=render_args,
                    render_globals=new_render_globals,
                    constant_arrays=constant_arrays,
                    keep=keep,
                    fast_math=fast_math,
                    compiler_options=compiler_options,
                )
                kernel_adapter = program.get_kernel_adapter(name)

                if kernel_adapter.max_total_local_size >= prod(vs.real_local_size):
                    # The kernel will execute with this local size, use it
                    break

                # By the contract of VirtualSizes,
                # prod(vs.real_local_size) <= max_total_local_size.
                # Also, since we're still in this loop,
                # kernel_adapter.max_total_local_size < prod(vs.real_local_size).
                # Therefore the new max_total_local_size value is guaranteed
                # to be smaller than the previous one.
                max_total_local_size = kernel_adapter.max_total_local_size

                # In most cases the iteration should stop at `max_total_local_size == 1`,
                # where the virtual size is trivial and always possible.
                # But occasionally we may get a kernel that cannot be executed at all
                # (e.g. it requests too much local memory),
                # and some platforms may return 0, which `VirtualSizes` will not like.
                # So we'll have a sanity check here.
                if max_total_local_size == 0:
                    raise VirtualSizeError(
                        "The kernel requires too many resources to be executed with any local size"
                    )

            kernel_adapters[device] = kernel_adapter
            sources[device] = program.source
            vs_metadata[device] = vs

        self.sources = sources
        self._vs_metadata = vs_metadata
        self._sd_kernel_adapters = kernel_adapters

        global_sizes = {device: vs.real_global_size for device, vs in self._vs_metadata.items()}
        local_sizes = {device: vs.real_local_size for device, vs in self._vs_metadata.items()}

        self._prepared_kernel = PreparedKernel(
            multi_device, kernel_adapters, global_sizes, local_sizes
        )
    def __call__(self, queue: Queue, *args: Union[Array, numpy.generic]) -> Any:
        """
        Execute the kernel. In case of the OpenCL backend, returns a ``pyopencl.Event`` object.

        :param queue: the multi-device queue to use.
        :param args: kernel arguments. See :py:meth:`grunnur.program.PreparedKernel.__call__`.
        """
        return self._prepared_kernel(queue, *args)
    def set_constant_array(
        self, queue: Queue, name: str, arr: Union[Array, "numpy.ndarray[Any, numpy.dtype[Any]]"]
    ) -> None:
        """
        Uploads a constant array to the context's devices (**CUDA only**).

        :param queue: the queue to use for the transfer.
        :param name: the name of the constant array symbol in the code.
        :param arr: either a device or a host array.
        """
        _check_set_constant_array(queue, self.devices)
        kernel_adapter = self._sd_kernel_adapters[queue.device]
        _set_constant_array(queue, kernel_adapter.program_adapter, name, arr)
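

# A minimal usage sketch (illustration, not part of the library): compile a static kernel
# for a single device and call it on a queue. The calls below (``API.any``,
# ``Context.from_devices``, ``Queue``, ``Array.from_host``, ``Array.get``) reflect the
# general grunnur API and are assumptions made here rather than definitions from this
# module; note that no sizes are passed at call time, since the virtual global size is
# fixed when the kernel is created.
#
#     import numpy
#     from grunnur import API, Context, Queue, Array, StaticKernel
#
#     api = API.any()
#     context = Context.from_devices([api.platforms[0].devices[0]])
#     queue = Queue(context.device)
#
#     src = """
#     KERNEL void double_it(GLOBAL_MEM int *data)
#     {
#         if (${static.skip}()) return;
#         const int i = ${static.global_id}(0);
#         data[i] = data[i] * 2;
#     }
#     """
#
#     kernel = StaticKernel([context.device], src, "double_it", (128,))
#     data = Array.from_host(queue, numpy.arange(128, dtype=numpy.int32))
#     kernel(queue, data)
#     result = data.get(queue)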