Source code for probnum.randprocs.kernels._kernel

"""Kernel / covariance function."""

import abc
from typing import Optional, Union

import numpy as np

from probnum import utils as _pn_utils
from probnum.typing import ArrayLike, IntLike, ShapeLike, ShapeType


class Kernel(abc.ABC):
    r"""(Cross-)covariance function(s)

    Abstract base class representing one or multiple (cross-)covariance function(s),
    also known as kernels.
    A cross-covariance function

    .. math::
        :nowrap:

        \begin{equation}
            k_{fg} \colon
            \mathcal{X}^{d_\text{in}} \times \mathcal{X}^{d_\text{in}}
            \to \mathbb{R}
        \end{equation}

    is a function of two arguments :math:`x_0` and :math:`x_1`, which represents the
    covariance between two evaluations :math:`f(x_0)` and :math:`g(x_1)` of two
    scalar-valued random processes :math:`f` and :math:`g` (or, equivalently, two
    outputs :math:`h_i(x_0)` and :math:`h_j(x_1)` of a vector-valued random process
    :math:`h`).
    If :math:`f = g`, then the cross-covariance function is also referred to as a
    covariance function, in which case it must be symmetric and positive
    (semi-)definite.

    An instance of a :class:`Kernel` can compute multiple different (cross-)covariance
    functions on the same pair of inputs simultaneously. For instance, it can be used to
    compute the full covariance matrix

    .. math::
        :nowrap:

        \begin{equation}
            C^f \colon
            \mathcal{X}^{d_\text{in}} \times \mathcal{X}^{d_\text{in}}
            \to \mathbb{R}^{d_\text{out} \times d_\text{out}},
            C^f_{i j}(x_0, x_1) := k_{f_i f_j}(x_0, x_1)
        \end{equation}

    of the vector-valued random process :math:`f`. To this end, we understand any
    :class:`Kernel` with a non-empty :attr:`shape` as a tensor with the given
    :attr:`shape`, which contains different (cross-)covariance functions as its entries.

    Parameters
    ----------
    input_dim :
        Input dimension of the kernel.
    shape :
        If ``shape`` is set to ``()``, the :class:`Kernel` instance represents a
        single (cross-)covariance function. Otherwise, i.e. if ``shape`` is non-empty,
        the :class:`Kernel` instance represents a tensor of (cross-)covariance
        functions whose shape is given by ``shape``.

    Examples
    --------

    >>> D = 3
    >>> k = pn.randprocs.kernels.Linear(input_dim=D)

    Generate some input data.

    >>> xs = np.repeat(np.linspace(0, 1, 4)[:, None], D, axis=-1)
    >>> xs.shape
    (4, 3)
    >>> xs
    array([[0.        , 0.        , 0.        ],
           [0.33333333, 0.33333333, 0.33333333],
           [0.66666667, 0.66666667, 0.66666667],
           [1.        , 1.        , 1.        ]])

    We can compute kernel matrices like so.

    >>> k.matrix(xs)
    array([[0.        , 0.        , 0.        , 0.        ],
           [0.        , 0.33333333, 0.66666667, 1.        ],
           [0.        , 0.66666667, 1.33333333, 2.        ],
           [0.        , 1.        , 2.        , 3.        ]])

    Inputs to :meth:`Kernel.__call__` are broadcast according to the "kernel
    broadcasting" rules detailed in the "Notes" section of the :meth:`Kernel._call__`
    documentation.

    >>> k(xs[:, None, :], xs[None, :, :])  # same as `.matrix`
    array([[0.        , 0.        , 0.        , 0.        ],
           [0.        , 0.33333333, 0.66666667, 1.        ],
           [0.        , 0.66666667, 1.33333333, 2.        ],
           [0.        , 1.        , 2.        , 3.        ]])

    A shape of ``1`` along the last axis is broadcast to :attr:`input_dim`.

    >>> xs_d1 = xs[:, [0]]
    >>> xs_d1.shape
    (4, 1)
    >>> xs_d1
    array([[0.        ],
           [0.33333333],
           [0.66666667],
           [1.        ]])
    >>> k(xs_d1[:, None, :], xs_d1[None, :, :])  # same as `.matrix`
    array([[0.        , 0.        , 0.        , 0.        ],
           [0.        , 0.33333333, 0.66666667, 1.        ],
           [0.        , 0.66666667, 1.33333333, 2.        ],
           [0.        , 1.        , 2.        , 3.        ]])
    >>> k(xs[:, None, :], xs_d1[None, :, :])  # same as `.matrix`
    array([[0.        , 0.        , 0.        , 0.        ],
           [0.        , 0.33333333, 0.66666667, 1.        ],
           [0.        , 0.66666667, 1.33333333, 2.        ],
           [0.        , 1.        , 2.        , 3.        ]])

    No broadcasting is applied if both inputs have the same shape. For instance, one can
    efficiently compute just the diagonal of the kernel matrix via

    >>> k(xs, xs)
    array([0.        , 0.33333333, 1.33333333, 3.        ])
    >>> k(xs, None)  # x1 = None is an efficient way to set x1 == x0
    array([0.        , 0.33333333, 1.33333333, 3.        ])

    and the diagonal above the main diagonal of the kernel matrix is retrieved through

    >>> k(xs[:-1, :], xs[1:, :])
    array([0.        , 0.66666667, 2.        ])
    """

    def __init__(
        self,
        input_dim: IntLike,
        shape: ShapeLike = (),
    ):
        self._input_dim = int(input_dim)

        self._shape = _pn_utils.as_shape(shape)

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}>"

[docs]    def __call__(
        self,
        x0: ArrayLike,
        x1: Optional[ArrayLike],
    ) -> Union[np.ndarray, np.floating]:
        """Evaluate the (cross-)covariance function(s).

        The inputs are broadcast to a common shape following the "kernel broadcasting"
        rules outlined in the "Notes" section.

        Parameters
        ----------
        x0 : array-like
            An array of shape ``()`` or ``(Nn, ..., N2, N1, D_in)``, where ``D_in`` is
            either ``1`` or :attr:`input_dim`, whose entries will be passed to the first
            argument of the kernel.
        x1 : array-like
            An array of shape ``()`` or ``(Mm, ..., M2, M1, D_in)``, where ``D_in`` is
            either ``1`` or :attr:`input_dim`, whose entries will be
            passed to the second argument of the kernel. Can also be set to ``None``,
            in which case the function will behave as if ``x1 = x0``.

        Returns
        -------
        k_x0_x1 : numpy.ndarray
            The (cross-)covariance function(s) evaluated at ``x0`` and ``x1``.
            If :attr:`shape` is ``()``, this method returns an array of shape
            ``(Lk, ..., L2, L1)`` whose entry at index ``(ik, ..., i2, i1)`` contains
            the evaluation of the (cross-)covariance function at the inputs
            ``x0[ik, ..., i2, i1, :] and ``x1[il, ..., i2, i1, :])``.
            For any non-empty :attr:`shape`, it returns an array of shape
            ``(Sl, ..., S2, S1, Lk, ..., L2, L1)``, where ``S`` is :attr:`shape`, whose
            entry at index ``(sl, ..., s2, s1, ik, ..., i2, i1)`` contains
            evaluation of the (cross-)covariance function at index ``(sl, ..., s2, s1)``
            at the inputs ``x0[ik, ..., i2, i1, :]`` and ``x1[ik, ..., i2, i1, :]``.
            Above, we assume that ``x0`` and ``x1`` have been broadcast according to the
            rules described in the "Notes" section.

        Raises
        ------
        ValueError
            If the inputs can not be "kernel broadcast" to a common shape.

        See Also
        --------
        matrix: Convenience function to compute a kernel matrix, i.e. a matrix of
            pairwise evaluations of the kernel on two sets of points.

        Notes
        -----
        A :class:`Kernel` operates on its two inputs by a slightly modified version of
        Numpy's broadcasting rules. First of all, the operation of the kernel is
        vectorized over all but the last dimension, applying standard broadcasting
        rules. An input with shape ``()`` is promoted to an input with shape ``(1,)``.
        Additionally, a ``1`` along the last axis of an input is interpreted as a (set
        of) point(s) with equal coordinates in all input dimensions, i.e. the inputs are
        broadcast to :attr:`input_dim` dimensions along the last axis. We refer to this
        modified set of broadcasting rules as "kernel broadcasting".

        Examples
        --------
        See documentation of class :class:`Kernel`.
        """

        x0 = np.atleast_1d(x0)

        if x1 is not None:
            x1 = np.atleast_1d(x1)

        # Shape checking
        broadcast_input_shape = self._kernel_broadcast_shapes(x0, x1)

        # Evaluate the kernel
        k_x0_x1 = self._evaluate(x0, x1)

        assert k_x0_x1.shape == self._shape + broadcast_input_shape[:-1]

        return k_x0_x1

[docs]    def matrix(
        self,
        x0: ArrayLike,
        x1: Optional[ArrayLike] = None,
    ) -> np.ndarray:
        """A convenience function for computing a kernel matrix for two sets of inputs.

        This is syntactic sugar for ``k(x0[:, None, :], x1[None, :, :])``. Hence, it
        computes the matrix of pairwise covariances between two sets of input points.
        If ``k`` represents a covariance function, then the resulting matrix will be
        symmetric positive (semi-)definite for ``x0 == x1``.

        Parameters
        ----------
        x0 : array-like
            First set of inputs to the (cross-)covariance function as an array of shape
            ``(M, D)``, where ``D`` is either 1 or :attr:`input_dim`.
        x1 : array-like
            Optional second set of inputs to the (cross-)covariance function as an array
            of shape ``(N, D)``, where ``D`` is either 1 or :attr:`input_dim`.
            If ``x1`` is not specified, the function behaves as if ``x1 = x0``.

        Returns
        -------
        kernmat : numpy.ndarray
            The matrix / stack of matrices containing the pairwise evaluations of the
            (cross-)covariance function(s) on ``x0`` and ``x1`` as an array of shape
            ``(M, N)`` if :attr:`shape` is ``()`` or
            ``(S[l - 1], ..., S[1], S[0], M, N)``, where ``S`` is :attr:`shape` if
            :attr:`shape` is non-empty.

        Raises
        ------
        ValueError
            If the shapes of the inputs don't match the specification.

        See Also
        --------
        __call__: Evaluate the kernel more flexibly.

        Examples
        --------
        See documentation of class :class:`Kernel`.
        """

        x0 = np.array(x0)
        x1 = x0 if x1 is None else np.array(x1)

        # Shape checking
        errmsg = (
            "`{argname}` must have shape `(N, D)` or `(D,)`, where `D` is the input "
            f"dimension of the kernel (D = {self.input_dim}), but an array with shape "
            "`{shape}` was given."
        )

        if not (1 <= x0.ndim <= 2 and x0.shape[-1] in (self.input_dim, 1)):
            raise ValueError(errmsg.format(argname="x0", shape=x0.shape))

        if not (1 <= x1.ndim <= 2 and x1.shape[-1] in (self.input_dim, 1)):
            raise ValueError(errmsg.format(argname="x1", shape=x1.shape))

        # Pairwise kernel evaluation
        return self(x0[:, None, :], x1[None, :, :])

    @property
    def input_dim(self) -> int:
        """Dimension of arguments of the covariance function."""
        return self._input_dim

    @property
    def shape(self) -> ShapeType:
        """If :attr:`shape` is ``()``, the :class:`Kernel` instance represents a
        single (cross-)covariance function. Otherwise, i.e. if :attr:`shape` is
        non-empty, the :class:`Kernel` instance represents a tensor of
        (cross-)covariance functions whose shape is given by ``shape``."""
        return self._shape

    @abc.abstractmethod
    def _evaluate(
        self,
        x0: ArrayLike,
        x1: Optional[ArrayLike],
    ) -> Union[np.ndarray, np.float_]:
        """Implementation of the kernel evaluation which is called after input checking.

        When implementing a particular kernel, the subclass should implement the kernel
        computation by overwriting this method. It is called by the
        :meth:`Kernel.__call__` method after applying input checking and promoting
        scalar inputs, i.e. inputs with shape ``()``, to arrays with shape ``(1,)``.
        The implementation of the kernel must implement the rules of kernel broadcasting
        which are outlined in the "Notes" section of the :meth:`Kernel.__call__`
        docstring. Note that the inputs are not automatically broadcast to a common
        shape, but it is guaranteed that the inputs can be broadcast to a common shape
        when applying the rules of "kernel broadcasting".

        Parameters
        ----------
        x0 : array-like
            An array of shape ``(Nn, ..., N2, N1, D_in)``, where ``D_in`` is either
            ``1`` or :attr:`input_dim`, whose entries will be passed to the first
            argument of the kernel.
        x1 : array-like
            An array of shape ``(Mm, ..., M2, M1, D_in)``, where ``D_in`` is either
            ``1`` or :attr:`input_dim`, whose entries will be passed to the second
            argument of the kernel. Can also be set to ``None``, in which case the
            method must behave as if ``x1 == x0``.

        Returns
        -------
        k_x0_x1 : numpy.ndarray
            The (cross-)covariance function(s) evaluated at ``x0`` and ``x1`` as an
            array of shape ``(S[l - 1], ..., S[1], S[0], Lk, ..., L2, L1)``,
            where ``S`` is :attr:`shape`, whose entry at index
            ``(sl, ..., s2, s1, ik, ..., i2, i1)`` contains the
            evaluation of the (cross-)covariance function at index ``(sl, ..., s2, s1)``
            at the inputs ``x0[ik, ..., i2, i1, :]`` and ``x1[ik, ..., i2, i1, :]``,
            where we assume that ``x0`` and ``x1`` have been broadcast according to the
            rules described in the "Notes" section of :meth:`Kernel.__call__`.
        """

    def _kernel_broadcast_shapes(
        self,
        x0: np.ndarray,
        x1: Optional[np.ndarray] = None,
    ) -> ShapeType:
        """Applies the "kernel broadcasting" rules to the input shapes.

        A description of the "kernel broadcasting" rules can be found in the docstring
        of :meth:`Kernel.__call__`.
        Appropriate exceptions are raised if the shapes can not be "kernel broadcast" to
        a common shape.

        Parameters
        ----------
        x0 :
            First input to the covariance function.
        x1 :
            Second input to the covariance function.

        Returns
        -------
        broadcast_input_shape : tuple of int
            Shape of the inputs after applying "kernel broadcasting".

        Raises
        -------
        ValueError
            If the inputs can not be "kernel broadcast" to a common shape.
        """

        err_msg = (
            "`{argname}` can not be broadcast to the kernel's input dimension "
            f"{self.input_dim} along the last axis, since "
            "`{argname}.shape = {shape}`."
        )

        if x0.shape[-1] not in (self.input_dim, 1):
            # This will never be called if the original input was a scalar
            raise ValueError(err_msg.format(argname="x0", shape=x0.shape))

        broadcast_input_shape = x0.shape

        if x1 is not None:
            if x1.shape[-1] not in (self.input_dim, 1):
                # This will never be called if the original input was a scalar
                raise ValueError(err_msg.format(argname="x1", shape=x1.shape))

            try:
                # Ironically, `np.broadcast_arrays` seems to be more efficient than
                # `np.broadcast_shapes`
                broadcast_input_shape = np.broadcast_arrays(x0, x1)[0].shape
            except ValueError as v:
                raise ValueError(
                    f"The input arrays `x0` and `x1` with shapes {x0.shape} and "
                    f"{x1.shape} can not be broadcast to a common shape."
                ) from v

        if broadcast_input_shape[-1] == 1:
            broadcast_input_shape = broadcast_input_shape[:-1] + (self.input_dim,)

        return broadcast_input_shape

    def _euclidean_inner_products(
        self, x0: np.ndarray, x1: Optional[np.ndarray]
    ) -> np.ndarray:
        """Implementation of the Euclidean inner product, which supports kernel
        broadcasting semantics."""
        prods = x0 ** 2 if x1 is None else x0 * x1

        if prods.shape[-1] == 1:
            return self.input_dim * prods[..., 0]

        return np.sum(prods, axis=-1)


class IsotropicMixin(abc.ABC):  # pylint: disable=too-few-public-methods
    r"""Mixin for isotropic kernels.

    An isotropic kernel is a kernel which only depends on the Euclidean norm of the
    distance between the arguments, i.e.

    .. math ::

        k(x_0, x_1) = k(\lVert x_0 - x_1 \rVert_2).

    Hence, all isotropic kernels are stationary.
    """

    def _squared_euclidean_distances(
        self, x0: np.ndarray, x1: Optional[np.ndarray]
    ) -> np.ndarray:
        """Implementation of the squared Euclidean distance, which supports kernel
        broadcasting semantics."""
        if x1 is None:
            return np.zeros_like(  # pylint: disable=unexpected-keyword-arg
                x0,
                shape=x0.shape[:-1],
            )

        sqdiffs = (x0 - x1) ** 2

        if sqdiffs.shape[-1] == 1:
            return self.input_dim * sqdiffs[..., 0]

        return np.sum(sqdiffs, axis=-1)

    def _euclidean_distances(
        self, x0: np.ndarray, x1: Optional[np.ndarray]
    ) -> np.ndarray:
        """Implementation of the Euclidean distance, which supports kernel
        broadcasting semantics."""
        if x1 is None:
            return np.zeros_like(  # pylint: disable=unexpected-keyword-arg
                x0,
                shape=x0.shape[:-1],
            )

        return np.sqrt(self._squared_euclidean_distances(x0, x1))