Source code for grunnur.static

from __future__ import annotations

from typing import Any, Callable, Optional, Union, Dict, Mapping, Tuple, Sequence

import numpy

from .array import Array
from .array_metadata import ArrayMetadataLike
from .device import Device
from .api import cuda_api_id
from .template import DefTemplate
from .modules import Snippet
from .context import Context, BoundDevice, BoundMultiDevice
from .queue import Queue
from .array import Array
from .utils import prod, update_dict
from .vsize import VirtualSizes, VirtualSizeError
from .program import (
    SingleDeviceProgram,
    PreparedKernel,
    normalize_sizes,
    _check_set_constant_array,
    _set_constant_array,
)


# the name of the global in the template containing static kernel modules
_STATIC_MODULES_GLOBAL = "static"


[docs]class StaticKernel:
    """
    An object containing a GPU kernel with fixed call sizes.

    The globals for the source template will contain an object with the name ``static``
    of the type :py:class:`~grunnur.vsize.VsizeModules` containing the id/size functions
    to be used instead of regular ones.
    """

    devices: BoundMultiDevice
    """Devices on which this kernel was compiled."""

    queue: Queue
    """The queue this static kernel was compiled and prepared for."""

    sources: Dict[BoundDevice, str]
    """Source files used for each device."""

    def __init__(
        self,
        devices: Sequence[BoundDevice],
        template_src: Union[str, Callable[..., str], DefTemplate, Snippet],
        name: str,
        global_size: Union[Sequence[int], Mapping[BoundDevice, Sequence[int]]],
        local_size: Union[
            Sequence[int], None, Mapping[BoundDevice, Optional[Sequence[int]]]
        ] = None,
        render_args: Sequence[Any] = (),
        render_globals: Mapping[str, Any] = {},
        constant_arrays: Optional[Mapping[str, ArrayMetadataLike]] = None,
        keep: bool = False,
        fast_math: bool = False,
        compiler_options: Optional[Sequence[str]] = None,
    ):
        """
        :param devices: a single- or a multi-device object on which to compile this program.
        :param template_src: a string with the source code, or a Mako template source to render.
        :param name: the kernel's name.
        :param global_size: see :py:meth:`~grunnur.program.Kernel.prepare`.
        :param local_size: see :py:meth:`~grunnur.program.Kernel.prepare`.
        :param render_globals: a dictionary of globals to pass to the template.
        :param constant_arrays: (**CUDA only**) a dictionary ``name: (size, dtype)``
            of global constant arrays to be declared in the program.
        """
        multi_device, n_global_size, n_local_size = normalize_sizes(
            devices, global_size, local_size
        )

        self.devices = multi_device

        kernel_adapters = {}
        sources = {}
        vs_metadata = {}
        for device in multi_device:
            device_params = device.params

            kernel_ls = n_local_size[device]
            kernel_gs = n_global_size[device]

            # Since virtual size function require some registers,
            # they affect the maximum local size.
            # Start from the device's max local size as the first approximation
            # and recompile kernels with smaller local sizes until convergence.

            max_total_local_size = device_params.max_total_local_size

            while True:
                # Try to find kernel launch parameters for the requested local size.
                # May raise VirtualSizeError if it's not possible,
                # just let it pass to the caller.
                vs = VirtualSizes(
                    max_total_local_size=max_total_local_size,
                    max_local_sizes=device_params.max_local_sizes,
                    max_num_groups=device_params.max_num_groups,
                    local_size_multiple=device_params.warp_size,
                    virtual_global_size=kernel_gs,
                    virtual_local_size=kernel_ls,
                )

                new_render_globals = update_dict(
                    render_globals,
                    {_STATIC_MODULES_GLOBAL: vs.vsize_modules},
                    error_msg=f"The global name '{_STATIC_MODULES_GLOBAL}' is reserved in static kernels",
                )

                # Try to compile the kernel with the corresponding virtual size functions
                program = SingleDeviceProgram(
                    device,
                    template_src,
                    render_args=render_args,
                    render_globals=new_render_globals,
                    constant_arrays=constant_arrays,
                    keep=keep,
                    fast_math=fast_math,
                    compiler_options=compiler_options,
                )
                kernel_adapter = program.get_kernel_adapter(name)

                if kernel_adapter.max_total_local_size >= prod(vs.real_local_size):
                    # Kernel will execute with this local size, use it
                    break

                # By the contract of VirtualSizes,
                # prod(vs.real_local_size) <= max_total_local_size
                # Also, since we're still in this loop,
                # kernel_adapter.max_total_local_size < prod(vs.real_local_size).
                # Therefore the new max_total_local_size value is guaranteed
                # to be smaller than the previous one.
                max_total_local_size = kernel_adapter.max_total_local_size

                # In most cases the iteration should stop at `max_total_local_size == 1`,
                # where the virtual size is trivial and always possible.
                # But occasionally we may get a kernel that cannot be executed at all
                # (e.g. it requests too much local memory),
                # and some platforms may return 0, which `VirtualSizes` will not like.
                # So we'll have a sanity check here.
                if max_total_local_size == 0:
                    raise VirtualSizeError(
                        "The kernel requires too much resourses to be executed with any local size"
                    )

            kernel_adapters[device] = kernel_adapter
            sources[device] = program.source
            vs_metadata[device] = vs

        self.sources = sources
        self._vs_metadata = vs_metadata
        self._sd_kernel_adapters = kernel_adapters

        global_sizes = {device: vs.real_global_size for device, vs in self._vs_metadata.items()}
        local_sizes = {device: vs.real_local_size for device, vs in self._vs_metadata.items()}

        self._prepared_kernel = PreparedKernel(
            multi_device, kernel_adapters, global_sizes, local_sizes
        )

[docs]    def __call__(self, queue: Queue, *args: Union[Array, numpy.generic]) -> Any:
        """
        Execute the kernel.
        In case of the OpenCL backend, returns a ``pyopencl.Event`` object.

        :param queue: the multi-device queue to use.
        :param args: kernel arguments. See :py:meth:`grunnur.program.PreparedKernel.__call__`.
        """
        return self._prepared_kernel(queue, *args)

[docs]    def set_constant_array(
        self, queue: Queue, name: str, arr: Union[Array, "numpy.ndarray[Any, numpy.dtype[Any]]"]
    ) -> None:
        """
        Uploads a constant array to the context's devices (**CUDA only**).

        :param queue: the queue to use for the transfer.
        :param name: the name of the constant array symbol in the code.
        :param arr: either a device or a host array.
        """
        _check_set_constant_array(queue, self.devices)
        kernel_adapter = self._sd_kernel_adapters[queue.device]
        _set_constant_array(queue, kernel_adapter.program_adapter, name, arr)