# Source code for kernelmethods.base

"""
This module implements the Base classes as well as the core classes for the library.

.. autosummary::
    :toctree: _autosummary

"""

from abc import ABC, abstractmethod
from collections.abc import Iterable
from copy import copy
from itertools import product as iter_product
from warnings import warn

import numpy as np
from kernelmethods import config as cfg
from kernelmethods.config import (KMAccessError, KMSetAdditionError,
                                  KernelMethodsWarning)
from kernelmethods.operations import (center_km, frobenius_norm, is_PSD,
                                      normalize_km,
                                      normalize_km_2sample)
from kernelmethods.utils import (check_callable, contains_nan_inf, ensure_ndarray_1D,
                                 ensure_ndarray_2D, get_callable_name, not_symmetric)
from scipy.sparse import issparse, lil_matrix


class BaseKernelFunction(ABC):
    """
    Abstract base class for kernel functions.

    Enforces each derived kernel:
    1. to be callable, with two inputs
    2. to have a name and a str representation
    3. provides a method to check whether the derived kernel func is a valid kernel
       i.e. kernel matrix derived on a random sample is positive semi-definite (PSD)
    4. and that it is symmetric (via tests) as required.

    """


    def __init__(self, name):
        """
        Constructor.

        Parameters
        ----------
        name : str
            short name to describe the nature of the kernel function

        """

        self.name = name


    @abstractmethod
    def __call__(self, x, y):
        """Actual computation, to be defined in the inherited class!"""


    def is_psd(self):
        """
        Tests whether kernel matrix produced via this function is PSD.

        The check is empirical: a KernelMatrix built from this kernel is attached
        to a random sample of shape (50, 4) drawn with np.random.rand, and the
        resulting full matrix is passed to is_PSD. As the sample is random (and
        not seeded here), the outcome reflects that one draw only.

        Returns
        -------
        The value returned by is_PSD for the full kernel matrix.
        """

        # passing the instance of the derived class
        km = KernelMatrix(self)

        km.attach_to(np.random.rand(50, 4))  # random_sample
        return is_PSD(km.full)


    @abstractmethod
    def __str__(self):
        """Representation"""


    # aliasing others to __str__ for now
    def __format__(self, _):
        """Representation; the format spec is accepted but ignored."""

        return self.__str__()


    def __repr__(self):
        """Representation"""

        return self.__str__()


class KernelFromCallable(BaseKernelFunction):
    """Class to create a custom kernel from a given callable.

    Parameters
    ----------
    input_func : callable
        A callable that can accept atleast 2 args
        Must not be builtin or C function.
        If func is a C or builtin func, wrap it in a python def

    name : str
        A name to identify this kernel in a human readable way

    func_params : dict
        Parameters to func
    """


    def __init__(self, input_func, name=None, **func_params):
        """
        Constructor.

        Parameters
        ----------
        input_func : callable
            A callable that can accept atleast 2 args
            Must not be builtin or C function.
            If func is a C or builtin func, wrap it in a python def

        name : str
            A name to identify this kernel in a human readable way

        func_params : dict
            Parameters to func

        """

        # validation is delegated to check_callable; its return value is what
        # gets stored and invoked later
        self.func = check_callable(input_func, min_num_args=2)
        self.params = func_params

        super().__init__(name=get_callable_name(input_func, name))


    def __call__(self, x, y):
        """Actual computation: calls the wrapped callable with the stored params."""

        return self.func(x, y, **self.params)


    def __str__(self):
        """human readable repr: the name, followed by the params when any exist"""

        arg_repr = '({})'.format(self.params) if len(self.params) > 0 else ''
        return "{}{}".format(self.name, arg_repr)


    # NOTE: __format__ can not be a plain alias of __str__, as Python calls it
    #   with an extra format-spec argument (e.g. during "{}".format(kernel)),
    #   which made the alias raise a TypeError.
    def __format__(self, _):
        """Same as __str__; the format spec is accepted but ignored."""

        return self.__str__()


    def __repr__(self):
        """Same as __str__"""

        return self.__str__()


class KernelMatrix(object):
    """
    KernelMatrix is a self-contained class for the Gram matrix induced by a kernel
    function on a sample.

    KernelMatrix behaves just like numpy arrays in terms of accessing its elements:

    KM[i,j] --> kernel function between samples i and j

    KM[set_i,set_j] where len(set_i)=m and len(set_j)=n returns a matrix KM of
    size m x n, where KM_ij = kernel between samples set_i(i) and set_j(j)

    Element access via KM[...] returns the raw kernel values; the ``normalized``
    flag only decides what the ``full`` property returns.

    Parameters
    ----------
    kernel : BaseKernelFunction
        kernel function that populates the kernel matrix

    normalized : bool
        Flag to indicate whether to normalize the kernel matrix
        Normalization is recommended, unless you have clear reasons not to.

    name : str
        short name to describe the nature of the kernel function

    """


    def __init__(self,
                 kernel,
                 normalized=True,
                 name='KernelMatrix'):
        """
        Constructor for the KernelMatrix class.

        Parameters
        ----------
        kernel : BaseKernelFunction
            kernel function that populates the kernel matrix

        normalized : bool
            Flag to indicate whether to normalize the kernel matrix
            Normalization is recommended, unless you have clear reasons not to.

        name : str
            short name to describe the nature of the kernel function

        Raises
        ------
        TypeError
            If kernel is not a BaseKernelFunction, or normalized is not a bool.

        """

        if not isinstance(kernel, BaseKernelFunction):
            raise TypeError('Input kernel must be derived from '
                            ' kernelmethods.BaseKernelFunction')

        if not isinstance(normalized, bool):
            raise TypeError('normalized flag must be True or False')

        self.kernel = kernel
        self._keep_normed = normalized
        self.name = name

        # sample-related state is defined up-front, so that attributes such as
        # .size and .shape can always be queried, even before attach_to()
        self._num_samples = None
        self._sample = None
        self._sample_name = None
        self._sample_two = None
        self._name_two = None
        self._two_samples = False
        self._sample_descr = ''
        self.shape = (0, 0)

        # user-defined attribute dictionary
        self._attr = dict()

        self._reset()


    def attach_to(self,
                  sample_one,
                  name_one='sample',
                  sample_two=None,
                  name_two=None):
        """
        Attach this kernel to a given sample.

        Any computations from previous samples and their results will be reset,
        along with all the previously set attributes.

        Parameters
        ----------
        sample_one : ndarray
            Input sample to operate on
            Must be a 2D dataset of shape (num_samples, num_features) e.g.
            MLDataset or ndarray When sample_two=None (e.g. during training),
            sample_two refers to sample_one.

        name_one : str
            Name for the first sample.

        sample_two : ndarray
            Second sample for the kernel matrix i.e. Y in K(X,Y)
            Must be a 2D dataset of shape (num_samples, num_features) e.g.
            MLDataset or ndarray The dimensionality of this sample (number of
            columns, sample_two.shape[1]) must match with that of sample_one

        name_two : str
            Name for the second sample.

        Raises
        ------
        ValueError
            If the two samples differ in their number of columns.
        """

        # NOTE: the inputs must expose a .dtype attribute (it is read below)
        self._sample = ensure_ndarray_2D(sample_one, ensure_dtype=sample_one.dtype)
        self._sample_name = name_one

        if sample_two is None:
            self._sample_two = self._sample
            self._name_two = name_one

            self._num_samples = self._sample.shape[0]
            self.shape = (self._num_samples, self._num_samples)
            self._two_samples = False

            self._sample_descr = "{} {}".format(self._sample_name,
                                                self._sample.shape)

        else:
            self._sample_two = ensure_ndarray_2D(sample_two,
                                                 ensure_dtype=sample_two.dtype)

            if self._sample.shape[1] != self._sample_two.shape[1]:
                raise ValueError('Dimensionalities of the two samples differ!')

            self._name_two = name_two
            self._num_samples = (self._sample.shape[0], self._sample_two.shape[0])
            self.shape = (self._sample.shape[0], self._sample_two.shape[0])

            self._two_samples = True

            self._sample_descr = "{} {} x {} {}" \
                                 "".format(self._sample_name, self._sample.shape,
                                           self._name_two, self._sample_two.shape)

        # cleanup old flags and reset to ensure fresh slate for this sample
        self._reset()


    def set_attr(self, name, value):
        """
        Sets user-defined attributes for the kernel matrix.

        Useful to identify this kernel matrix in various aspects!
        You could think of them as tags or identifiers etc.
        As they are user-defined, they are ideal to represent user needs and
        applications.

        Note all attributes are cleared whenever a new sample is attached.

        Parameters
        ----------
        name : str or hashable
            Names of the attribute.

        value : object
            Value of the attribute

        """

        self._attr[name] = value


    def get_attr(self, attr_name, value_if_not_found=None):
        """
        Returns the value of the user-defined attribute.

        Parameters
        ----------
        attr_name : str or hashable

        value_if_not_found : object
            If attribute was not set previously, returns this value

        Returns
        -------
        attr_value : object
            Value of the attribute if found.
            Or value_if_not_found if attribute is not found.

        """

        return self._attr.get(attr_name, value_if_not_found)


    def attributes(self):
        """
        Returns all the attributes currently set.

        Returns
        -------
        attributes : dict
            Dict of the all the attributes currently set.
            This is the internal dict itself (not a copy).
        """

        return self._attr


    @property  # this is to prevent accidental change of value
    def num_samples(self):
        """
        Returns the number of samples in the sample this kernel is attached to.

        This would be a scalar when the current instance is attached to a single
        sample. When a product of two samples i.e. K(X,Y) instead of K(X,X), it is a
        tuple of 2 scalars representing num_samples from those two samples.
        It is None when no sample is attached yet.
        """

        return self._num_samples


    def _reset(self):
        """Convenience routine to reset internal state"""

        self._populated_fully = False
        self._lower_tri_km_filled = False
        self._is_centered = False
        self._is_normed = False

        # dropping every cached result derived from the previous sample,
        #   including the Frobenius norm which is guarded by hasattr() only
        for cached in ('_full_km', '_normed_km', '_centered', '_frob_norm'):
            if hasattr(self, cached):
                delattr(self, cached)

        # As K(i,j) is the same as K(j,i), only one of them needs to be computed!
        #  so internally we could store both K(i,j) and K(j,i) as K(min(i,j),
        #  max(i,j))
        self._KM = dict()

        # restricting attributes to the latest sample only, to avoid leakage!!
        self._attr.clear()

        # debugging and efficiency measurement purposes
        # for a given sample (of size n),
        #   number of kernel evals must never be more than n+ n*(n-1)/2 (or n(n+1)/2)
        #   regardless of the number of times different forms of KM are accessed!
        self._num_ker_eval = 0


    def _check_attached(self):
        """Raises ValueError when no sample has been attached yet."""

        if self._sample is None:
            raise ValueError('No sample is attached yet!\n Attach a sample first '
                             'before trying to use the KernelMatrix')


    @staticmethod
    def _is_integer(obj):
        """True for Python and NumPy integers of any width (bool is excluded)."""

        return isinstance(obj, (int, np.integer)) and not isinstance(obj, bool)


    def _ensure_dense_full(self):
        """
        Makes sure the cached matrix is fully computed, has its lower triangle
        filled and is in dense format, and returns it.

        Checking the flags (instead of _populated_fully alone) matters when
        .full_sparse was accessed earlier, which leaves behind a sparse
        upper-triangular cache.
        """

        if (not self._populated_fully) or (not self._lower_tri_km_filled) \
            or issparse(self._full_km):
            self._populate_fully(dense_fmt=True, fill_lower_tri=True)

        return self._full_km


    @property
    def size(self):
        """
        Returns the size of the KernelMatrix (total number of elements).

        In a single-sample case, it is the square of num_samples in the dataset.
        In two-sample case, it is the product of num_samples from two datasets.
        It is 0 when no sample is attached yet.

        Defining this to correspond to .size attr of numpy arrays
        """

        if self._num_samples is None:
            return 0

        if not self._two_samples:
            return self._num_samples ** 2
        else:
            return int(np.prod(self._num_samples))


    def __len__(self):
        """Convenience wrapper for .size attribute, to enable use of len(
        KernelMatrix)"""

        return self.size


    @property
    def full(self):
        """
        Fully populated kernel matrix in dense ndarray format.

        Returns the normalized version when this instance was created with
        normalized=True, and the raw kernel matrix otherwise.

        Raises
        ------
        ValueError
            When no sample is attached yet.
        """

        self._check_attached()
        self._ensure_dense_full()

        if self._keep_normed:
            if not self._is_normed:
                self.normalize()
            return self._normed_km
        else:
            return self._full_km


    @property
    def full_sparse(self):
        """Kernel matrix populated in upper tri in sparse array format."""

        km = self._populate_fully(dense_fmt=False, fill_lower_tri=False)
        if not issparse(km):
            # cache was already densified (e.g. via .full): derive the
            #   upper-triangular sparse form without disturbing the dense cache
            km = lil_matrix(np.triu(km))

        return km


    def center(self):
        """
        Method to center the kernel matrix

        The result is stored internally and exposed via the .centered property.

        Returns
        -------
        None

        Raises
        ------
        NotImplementedError
            If the KM is attached two separate samples.
            Centering a KM is possible only when attached to a single sample.
        """

        if self._two_samples:
            raise NotImplementedError('Centering is not implemented (or possible)'
                                      ' when KM is attached two separate samples.')

        self._centered = center_km(self._ensure_dense_full())
        self._is_centered = True


    def normalize(self, method='cosine'):
        """

        Normalize the kernel matrix to have unit diagonal.

        Cosine normalization implements definition according to Section 5.1 in
        Shawe-Taylor and Cristianini, "Kernels Methods for Pattern Analysis", 2004

        The result is cached: once normalized, later calls are no-ops until a new
        sample is attached. ``method`` is passed on in the single-sample case only.

        Parameters
        ----------
        method : str
            Identifier of the method.

        Returns
        -------
        None

        """

        full_km = self._ensure_dense_full()

        if not self._is_normed:
            if not self._two_samples:
                self._normed_km = normalize_km(full_km, method=method)
            else:
                # KM_XX and KM_YY must NOT be normalized for correct norm of K_XY
                #   NOTE: K_XY may NOT have unit diagonal
                #       as k(x,y) != sqrt(k(x,x))*sqrt(k(y,y))
                KM_XX = KernelMatrix(self.kernel, normalized=False)
                KM_XX.attach_to(sample_one=self._sample)

                KM_YY = KernelMatrix(self.kernel, normalized=False)
                KM_YY.attach_to(sample_one=self._sample_two)

                # not passing .full_km for KM_XX and KM_YY as we only need their
                # diagonal
                self._normed_km = normalize_km_2sample(full_km,
                                                       KM_XX.diagonal(),
                                                       KM_YY.diagonal())
            self._is_normed = True

            if contains_nan_inf(self._normed_km):
                # str() avoids relying on the kernel object supporting format()
                warn('Kernel matrix computation resulted in Inf or NaN values!'
                     ' Check your parameters and data!\n Kernel function: {}'
                     ''.format(str(self.kernel)), KernelMethodsWarning)


    @property
    def centered(self):
        """Exposes the centered version of the kernel matrix"""

        if self._two_samples:
            raise KMAccessError('Centering not defined when attached to 2 samples!')

        if not self._is_centered:
            self.center()

        return self._centered


    @property
    def frob_norm(self):
        """Returns the Frobenius norm of the current (raw) kernel matrix"""

        # cached per sample: _reset() removes it when a new sample is attached
        if not hasattr(self, '_frob_norm'):
            self._frob_norm = frobenius_norm(self._ensure_dense_full())

        return self._frob_norm


    def diagonal(self):
        """
        Returns the diagonal of the kernel matrix, when attached to a single sample.

        The values are raw kernel evaluations k(x_i, x_i).

        Raises
        ------
            KMAccessError
                When this instance is attached to more than one sample
            ValueError
                When no sample is attached yet
        """

        if self._two_samples:
            raise KMAccessError('Diagonal() not defined when attached to 2 samples!')

        self._check_attached()

        return np.array(
            [self._eval_kernel(idx, idx) for idx in range(self.shape[0])])


    @property
    def normed_km(self):
        """Access to the normalized kernel matrix."""

        if not self._is_normed:
            self.normalize()

        return self._normed_km


    def _eval_kernel(self, idx_one, idx_two):
        """Returns kernel value between samples identified by indices one and two"""

        # maintaining only upper triangular parts, when attached to a single sample
        #   by ensuring the first index is always <= second index
        if idx_one > idx_two and not self._two_samples:
            idx_one, idx_two = idx_two, idx_one
        # above is more efficient than below:
        #  idx_one, idx_two = min(idx_one, idx_two), max(idx_one, idx_two)

        if (idx_one, idx_two) not in self._KM:
            self._KM[(idx_one, idx_two)] = \
                self.kernel(self._sample[idx_one, :],  # from 1st sample
                            self._sample_two[idx_two, :])  # from 2nd sample
            # second refers to the first in the default case!
            self._num_ker_eval += 1

        return self._KM[(idx_one, idx_two)]


    def _features(self, index):
        """
        Returns the sample [features] corresponding to a given index.

        Using this would help abstract out the underlying data structure for
        samples and their features. For example, inputs can be simply CSVs,
        or numpy arrays or MLDataset or xarray or pandas etc. Disadvantages
        include the 2 extra function calls to be made for each kernel eval,
        which could be saved when operating on a predetermined format.
        """

        return self._sample[index, :]


    def __getitem__(self, index_obj):
        """
        Item getter to allow for efficient access
        to partial or random portions of kernel matrix!

        Indexing here is aimed to be compliant with numpy implementation
        as much as possible: https://docs.scipy.org/doc/numpy-1.13.0/reference
        /arrays.indexing.html#arrays-indexing

        A single integer is treated as a flat index into the matrix. Otherwise
        two index objects (int, slice or iterable of ints) are needed, one per
        dimension. The values returned are raw kernel values, in a dense ndarray.

        Raises
        ------
        KMAccessError
            If the indexing object is not valid, or is out of range.
        ValueError
            When no sample is attached yet.

        """

        self._check_attached()

        # any integer width is accepted e.g. np.int32, as well as np.intp which
        #   np.unravel_index itself returns
        if self._is_integer(index_obj):
            index_obj = np.unravel_index(index_obj, self.shape)

        if (not isinstance(index_obj, Iterable)) or len(index_obj) != 2 or \
            isinstance(index_obj, str) or index_obj is None:
            raise KMAccessError('Indexing object must be an iterable of length 2. '
                                'Supply two [sets/ranges of] indices in a tuple! '
                                'It can not be a string or None either. '
                                'Provided: {}'.format(index_obj))

        set_one, are_all_selected_dim_one = self._get_indices_in_sample(index_obj[0],
                                                                        dim=0)
        set_two, are_all_selected_dim_two = self._get_indices_in_sample(index_obj[1],
                                                                        dim=1)

        # below code prevents user from [VERY] inefficiently computing
        # the entire kernel matrix with KM[:,:],
        # without exploiting the fact that KM is symmetric
        if are_all_selected_dim_one and are_all_selected_dim_two:
            # dense, to match the ndarray returned for the partial selections
            return self._ensure_dense_full()
        else:
            return self._compute_for_index_combinations(set_one, set_two)


    def _get_indices_in_sample(self, index_obj_per_dim, dim):
        """
        Turn an index or slice object on a given dimension
        into a set of row indices into sample the kernel matrix is attached to.

        The size of the dimension is taken from self.shape[dim].

        Returns
        -------
        indices : list
            Unique indices selected, sorted in ascending order.

        are_all_selected : bool
            True when every index along this dimension got selected.

        Raises
        ------
        KMAccessError
            If the index object is of an invalid type, any index is negative or
            beyond the size of the dimension, or nothing is selected.

        """

        if self._is_integer(index_obj_per_dim):
            indices = [int(index_obj_per_dim), ]  # making it iterable
        elif isinstance(index_obj_per_dim, slice):
            _slice_index_list = index_obj_per_dim.indices(self.shape[dim])
            indices = list(range(*_slice_index_list))  # *list expands it as args
        elif isinstance(index_obj_per_dim, Iterable) and \
            not isinstance(index_obj_per_dim, str):
            # TODO no restriction on float: float indices will be rounded down
            #  towards 0
            indices = list(map(int, index_obj_per_dim))
        else:
            raise KMAccessError('Invalid index method/indices for kernel matrix '
                                'of shape : {km_shape}.'
                                ' Only int, slice or iterable objects are allowed!'
                                ''.format(km_shape=self.shape))

        # enforcing constraints
        if any(index >= self.shape[dim] or index < 0 for index in indices):
            raise KMAccessError('Invalid index method/indices for kernel matrix!\n'
                                ' Some indices in {} are out of range: '
                                ' shape : {km_shape},'
                                ' index values must all be >=0 and < corr. dimension'
                                ''.format(indices, km_shape=self.shape))

        # slice object returns empty list if all specified are out of range
        if len(indices) == 0:
            raise KMAccessError('No samples were selected in dim {}'.format(dim))

        # removing duplicates and sorting
        indices = sorted(set(indices))

        are_all_selected = len(indices) == self.shape[dim]

        return indices, are_all_selected


    def _compute_for_index_combinations(self, set_one, set_two):
        """
        Computes value of kernel matrix for all combinations of given set of indices

        Returns an ndarray of shape (len(set_one), len(set_two)).
        """

        # NOTE(review): values are cast to the dtype of the sample, whereas the
        #   full matrix uses cfg.km_dtype -- confirm this difference is intended
        return np.array([self._eval_kernel(idx_one, idx_two)
                         for idx_one, idx_two in iter_product(set_one, set_two)],
                        dtype=self._sample.dtype).reshape(len(set_one), len(set_two))


    def _populate_fully(self, dense_fmt=False, fill_lower_tri=False):
        """Applies the kernel function on all pairs of points in a sample.

        CAUTION: this may not always be necessary,
            and can take HUGE memory for LARGE datasets,
            and also can take a lot of time.

        Parameters
        ----------
        dense_fmt : bool
            If True, the cached matrix is a dense ndarray (an existing sparse
            cache is converted). If False, a new cache is created as lil_matrix,
            but an existing dense cache is left as is.

        fill_lower_tri : bool
            If True, elements below the main diagonal are filled in as well.

        Returns
        -------
        The cached kernel matrix (raw values).

        Raises
        ------
        ValueError
            When no sample is attached yet.
        RuntimeError
            When computing or symmetrizing the kernel matrix fails.

        """

        self._check_attached()

        # kernel matrix is symmetric (in a single sample case)
        #   so we need only to STORE half the matrix!
        # as we are computing the full matrix anyways, it's better to keep a copy
        #   to avoid recomputing it for each access of self.full* attributes
        if not self._populated_fully:
            if not dense_fmt:
                self._full_km = lil_matrix(self.shape, dtype=cfg.km_dtype)
            else:
                # filling with nan to avoid unexpected usage!
                self._full_km = np.full(self.shape, fill_value=np.nan,
                                        dtype=cfg.km_dtype)

            try:
                # kernel matrix is symmetric (in a single sample case)
                #   so we need only compute half the matrix!
                # computing the kernel for diagonal elements i,i as well
                #   as ix_two, even when equal to ix_one,
                #   refers to sample_two in the two_samples case
                for ix_one in range(self.shape[0]): # number of rows!
                    for ix_two in range(ix_one, self.shape[1]): # from second sample!
                        self._full_km[ix_one, ix_two] = \
                            self._eval_kernel(ix_one, ix_two)
            except Exception as exc:
                # not leaving a half-filled matrix behind, which would otherwise
                #   be mistaken for a usable cache in a later call
                del self._full_km
                raise RuntimeError('Unable to fully compute the kernel matrix!') \
                    from exc
            else:
                self._populated_fully = True

        # converting before symmetrizing, so the lower triangle is filled in the
        #   dense array. toarray() returns a plain ndarray, unlike todense()
        #   which returns np.matrix
        if issparse(self._full_km) and dense_fmt:
            self._full_km = self._full_km.toarray()

        if fill_lower_tri and not self._lower_tri_km_filled:
            try:
                # choosing k=-1 as main diag is already covered above (nested for
                # loop)
                ix_lower_tri = np.tril_indices(self.shape[0], m=self.shape[1], k=-1)

                if not self._two_samples and self.shape[0] == self.shape[1]:
                    self._full_km[ix_lower_tri] = self._full_km.T[ix_lower_tri]
                else:
                    # evaluating it for the lower triangle as well!
                    for ix_one, ix_two in zip(*ix_lower_tri):
                        self._full_km[ix_one, ix_two] = self._eval_kernel(ix_one,
                                                                          ix_two)
            except Exception as exc:
                raise RuntimeError('Unable to symmetrize the kernel matrix!') \
                    from exc
            else:
                self._lower_tri_km_filled = True

        if contains_nan_inf(self._full_km):
            # str() avoids relying on the kernel object supporting format()
            warn('Kernel matrix computation resulted in Inf or NaN values!'
                 ' Check your parameters and data!\n Kernel function: {}'
                 ''.format(str(self.kernel)), KernelMethodsWarning)

        return self._full_km


    def __str__(self):
        """human readable presentation"""

        string = "{}: {}".format(self.name, str(self.kernel))
        if self._sample is not None:
            # showing normalization status only when attached to data!
            string += " (normed={}) on {}".format(self._keep_normed,
                                                  self._sample_descr)

        return string


    # NOTE: __format__ can not be a plain alias of __str__, as Python calls it
    #   with an extra format-spec argument (e.g. during "{}".format(km)),
    #   which made the alias raise a TypeError.
    def __format__(self, _):
        """Same as __str__; the format spec is accepted but ignored."""

        return self.__str__()


    def __repr__(self):
        """Same as __str__"""

        return self.__str__()


    # TODO implement arithmetic operations on kernel matrices
    def __add__(self, other):
        """Addition"""
        raise NotImplementedError()


    def __mul__(self, other):
        """Multiplication"""
        raise NotImplementedError()


    def __sub__(self, other):
        """Subtraction"""
        raise NotImplementedError()


class KernelMatrixPrecomputed(object):
    """Convenience decorator for kernel matrices in ndarray or simple matrix
    format.

    Parameters
    ----------
    matrix : ndarray or array-like
        Precomputed kernel matrix. Must be 2D, symmetric and real-valued.
        Anything that is not an ndarray is converted with np.array.

    name : str
        Identifier and name for this KM. Defaults to 'Precomputed'.
    """


    def __init__(self, matrix, name=None):
        """
        Constructor

        Raises
        ------
        ValueError
            If the matrix is not 2D, or not symmetric, or not real-valued.
        """

        if not isinstance(matrix, np.ndarray):
            matrix = np.array(matrix)

        if matrix.ndim != 2 or not_symmetric(matrix) or \
            (not np.isreal(matrix).all()):
            raise ValueError('Input matrix appears to be NOT 2D or symmetric or '
                             'not real! A real-valued symmetric matrix is needed '
                             'for a valid kernel.')

        self._KM = matrix
        self.num_samples = self._KM.shape[0]

        if name is None:
            self.name = 'Precomputed'
        else:
            self.name = str(name)


    def __len__(self):
        """size of kernel matrix"""

        return self.size


    @property
    def size(self):
        """size of kernel matrix i.e. the number of rows in the matrix"""

        return self._KM.shape[0]


    @property
    def shape(self):
        """Shape of the kernel matrix"""

        return self._KM.shape


    @property
    def full(self):
        """Returns the full kernel matrix (in dense format, as its already
        precomputed)"""
        return self._KM


    @property
    def diag(self):
        """Returns the diagonal of the kernel matrix"""

        return self._KM.diagonal()


    def __getitem__(self, index_obj):
        """
        Access the matrix, by passing the index on to the underlying ndarray.

        Raises
        ------
        KMAccessError
            If the underlying array rejects the index.
        """

        try:
            return self._KM[index_obj]
        except Exception as exc:
            # Exception (not a bare except) lets KeyboardInterrupt etc. through
            raise KMAccessError('Invalid attempt to access the 2D kernel matrix!') \
                from exc


    def __str__(self):
        """human readable presentation"""

        return "{}(num_samples={})".format(self.name, self.num_samples)


    # NOTE: __format__ can not be a plain alias of __str__, as Python calls it
    #   with an extra format-spec argument (e.g. during "{}".format(km)),
    #   which made the alias raise a TypeError.
    def __format__(self, _):
        """Same as __str__; the format spec is accepted but ignored."""

        return self.__str__()


    def __repr__(self):
        """Same as __str__"""

        return self.__str__()


[docs]class ConstantKernelMatrix(object):
    """Custom KernelMatrix (KM) to efficiently represent a constant.

    Parameters
    ----------
    num_samples : int
        Number of samples (size) for this KM

    value : float
        Constant value for all elements in this KM

    name : str
        Identifier and name for this KM

    dtype : dtype
        Data type for the constant value
    """


    def __init__(self,
                 num_samples,
                 value=0.0,
                 name='Constant',
                 dtype='float'):
        """
        Constant kernel matrix

        Parameters
        ----------
        num_samples : int
            Number of samples (size) for this KM

        value : float
            Constant value for all elements in this KM

        name : str
            Identifier and name for this KM

        dtype : dtype
            Data type for the constant value
        """

        self.num_samples = num_samples
        self.const_value = value
        self.dtype = dtype

        if name is None:
            self.name = 'Constant'
        else:
            self.name = str(name)


    def __len__(self):
        """size of kernel matrix"""

        return self.size


    @property
    def size(self):
        """Size of kernel matrix"""
        return self.num_samples


    @property
    def shape(self):
        """Shape of the kernel matrix"""
        return (self.num_samples, self.num_samples)


    @property
    def full(self):
        """Returns the full kernel matrix (in dense format)"""

        if not hasattr(self, '_KM'):
            self._KM = np.full((self.num_samples, self.num_samples),
                               fill_value=self.const_value,
                               dtype=self.dtype)

        return self._KM


    @property
    def diag(self):
        """Returns the diagonal of the kernel matrix"""

        return np.full((self.num_samples,),
                       fill_value=self.const_value, dtype=self.dtype)


    def __getitem__(self, index_obj):
        """Access the matrix"""

        if (not isinstance(index_obj, Iterable)) or len(index_obj) != 2 or \
            isinstance(index_obj, str) or index_obj is None:
            raise KMAccessError('Indexing object must be an iterable of length 2.'
                                'It can not be a string or None either.')

        # full-fledged behavior and eval of this getitem is needed to make this
        # fully compatible with the generic KernelMatrix class
        row_indices = self._get_indices_in_sample(index_obj[0])
        col_indices = self._get_indices_in_sample(index_obj[1])

        # all we need to know is the number of indices selected
        # (and they were indeed in admissible range)
        return np.full((len(row_indices), len(col_indices)),
                       fill_value=self.const_value,
                       dtype=self.dtype)


    def _get_indices_in_sample(self, index_obj_per_dim):
        """
        Turn an index or slice object on a given dimension
        into a list of indices into the sample the kernel matrix is attached to.

        As the kernel matrix is 2D and symmetric of known size,
        dimension size doesn't need to be specified, it is taken from
        self.num_samples

        Parameters
        ----------
        index_obj_per_dim : int, slice or Iterable
            Selector for a single dimension. Integer scalars (Python or NumPy,
            of any width) select one element, slices are resolved against
            ``self.num_samples``, and elements of other iterables are
            converted with ``int()``.

        Returns
        -------
        indices : list
            Sorted list of the unique indices selected.

        Raises
        ------
        KMAccessError
            If the selector is a string or None, is of an unsupported type,
            contains an index outside ``[0, self.num_samples)``,
            or selects nothing at all.

        """

        if isinstance(index_obj_per_dim, str) or index_obj_per_dim is None:
            raise KMAccessError('Indices can not be strings!')

        # np.integer (rather than np.int_) so that integer scalars of any
        # width are accepted, not just the platform-default one
        if np.issubdtype(type(index_obj_per_dim), np.integer):
            indices = [index_obj_per_dim, ]  # making it iterable
        elif isinstance(index_obj_per_dim, slice):
            # slice.indices() clips start/stop/step to the dimension size
            _slice_index_list = index_obj_per_dim.indices(self.num_samples)
            indices = list(range(*_slice_index_list))  # *list expands it as args
        elif isinstance(index_obj_per_dim, Iterable):
            # TODO no restriction on float: float indices will be rounded down
            #  towards 0
            indices = list(map(int, index_obj_per_dim))
        else:
            raise KMAccessError('Invalid index method/indices {indices} '
                                'for kernel matrix of shape : {km_shape}.'
                                ' Only int, slice or iterable objects are allowed!'
                                ''.format(km_shape=self.shape,
                                          indices=index_obj_per_dim))

        # enforcing constraints: negative indices are not permitted here
        if any(index >= self.num_samples or index < 0 for index in indices):
            raise KMAccessError('Invalid index method/indices for kernel matrix!\n'
                                ' Some indices in {} are out of range: '
                                ' shape : {km_shape},'
                                ' index values must all be >=0 and < corr. dimension'
                                ''.format(indices, km_shape=self.shape))

        # slice object returns empty list if all specified are out of range
        if len(indices) == 0:
            # reporting the selector itself: there is no dimension
            # identifier available in this method
            raise KMAccessError('No samples were selected by index {}'
                                ''.format(index_obj_per_dim))

        # removing duplicates and sorting
        return sorted(set(indices))


    def __str__(self):
        """human readable presentation"""

        return "{}(value={},size={})" \
               "".format(self.name, self.const_value, self.num_samples)


    def __format__(self, format_spec):
        """
        Formats the human readable presentation according to format_spec.

        This can not be a plain alias of __str__: __format__ receives the
        format spec as an extra argument, so an alias would raise TypeError
        in format(), str.format() and f-strings.
        """

        return format(str(self), format_spec)


    # aliasing it to __str__ for now
    __repr__ = __str__


# Types that the KernelSet constructor accepts as a single kernel matrix
# (they are also listed in the TypeError it raises for unknown input types)
VALID_KERNEL_MATRIX_TYPES = (KernelMatrix, KernelMatrixPrecomputed, np.ndarray)


class KernelSet(object):
    """
    Container class to manage a set of compatible KernelMatrix instances.

    Compatibility is checked based on the size (number of samples they operate on).
    Provides methods to iterate over the KMs, access a subset and query the
    underlying kernel funcs.

    """


    def __init__(self,
                 km_list=None,
                 name='KernelSet',
                 num_samples=None):
        """
        Constructor of the KernelSet class.

        Parameters
        ----------
        km_list : Iterable or KernelMatrix or ndarray or None
            Initial set of kernel matrices to be added to this KernelSet.
            A 2D ndarray is taken to be a single kernel matrix.

        name : str
            Name for this kernel set.

        num_samples : int
            Specifying the number of samples to be expected in each kernel matrix.
            Matching number of samples is a condition for compatibility.
            If not set during instantiation, it is inferred from the first KM.

        Raises
        ------
        TypeError
            If km_list is not None, not an iterable and not one of the types in
            VALID_KERNEL_MATRIX_TYPES
        """

        self.name = name

        # empty to start with
        self._km_set = list()

        # user can choose to set the properties of the kernel matrices
        # this num_samples property is key, as only KMs with same value are
        # allowed in. When it is None, no KM has been added yet, or their
        # size property is not set
        self._num_samples = num_samples
        self._is_init = num_samples is not None

        if km_list is None:
            pass  # do nothing
        elif isinstance(km_list, np.ndarray) and km_list.ndim == 2:
            # an ndarray is itself an Iterable: without this check, a single
            # 2D matrix would be iterated over row by row, instead of being
            # added as one kernel matrix
            self.append(km_list)
        elif (not isinstance(km_list, str)) and isinstance(km_list, Iterable):
            for km in km_list:
                self.append(km)
        elif isinstance(km_list, VALID_KERNEL_MATRIX_TYPES):
            self.append(km_list)
        else:
            raise TypeError('Unknown type of input matrix! '
                            'Must be one of:\n'
                            '{}'.format(VALID_KERNEL_MATRIX_TYPES))


    @property
    def size(self):
        """Number of kernel matrices in this set"""

        return len(self._km_set)


    @property
    def num_samples(self):
        """Number of samples in each individual kernel matrix """

        return self._num_samples


    def __len__(self):
        """Returns the number of kernels in this set"""

        return len(self._km_set)


    # TODO not a priority, but we might need methods to remove existing KMs
    def append(self, KM):
        """
        Method to add a new kernel to the set.

        Checks to ensure the new KM is compatible in size to the existing set.

        Parameters
        ----------
        KM : KernelMatrix or ndarray or compatible
            kernel matrix to be appended to the KernelSet.
            Anything that is not a BaseKernelFunction, KernelMatrix or
            KernelMatrixPrecomputed is wrapped in a KernelMatrixPrecomputed first.

        Raises
        ------
        KMSetAdditionError
            If number of samples of KM differs from that of this set

        """

        if not isinstance(KM, (BaseKernelFunction, KernelMatrix,
                               KernelMatrixPrecomputed)):
            KM = KernelMatrixPrecomputed(KM)

        # the very first KM defines the size of the set, unless it was
        # specified at the time of instantiation
        if not self._is_init and self._num_samples is None:
            self._num_samples = copy(KM.num_samples)
            self._is_init = True

        if self._num_samples != KM.num_samples:
            raise KMSetAdditionError('Dimension of this KM {} is incompatible '
                                     'with KMSet of {}! '
                                     ''.format(KM.num_samples, self.num_samples))

        self._km_set.append(KM)


    def __getitem__(self, index):
        """
        To retrieve individual kernels

        Raises
        ------
        ValueError
            If index is not an integer

        IndexError
            If index is negative, or not smaller than the size of this set
        """

        if not (isinstance(index, int) or
                np.issubdtype(np.asanyarray(index).dtype, np.integer)):
            raise ValueError('Only integer indices are permitted, '
                             'accessing one KM at a time')

        if index < 0 or index >= self.size:
            raise IndexError('Index out of range for KernelSet of size {}'
                             ''.format(self.size))

        # TODO elements need to accessible by more than a simple integer index!
        #   Perhaps KernelMatrix can provide a hash to uniquely refer to an instance
        return self._km_set[index]


    def take(self, indices, name='SelectedKMs'):
        """
        Returns a new KernelSet with requested kernel matrices, identified by
        their indices.

        Parameters
        ----------
        indices : Iterable
            List of indices identifying the kernel matrices to return

        name : str
            Name for the new kernel set.

        Returns
        -------
        ks : KernelSet
            New kernel set with the selected KMs. The KMs themselves are
            shared with this set (references, not copies).

        """

        indices = self._check_indices(indices)

        new_set = KernelSet(name=name)
        for idx in indices:
            # TODO should we add a copy of ith KM, or just a reference?
            #   No copy-->accidental changes!
            new_set.append(self._km_set[idx])

        return new_set


    def get_kernel_funcs(self, indices):
        """
        Returns kernel functions underlying the specified kernel matrices in this
        kernel set.

        This is helpful to apply a given set of kernel functions on new sets of
        data (e.g. test set)

        Parameters
        ----------
        indices : Iterable
            List of indices identifying the kernel matrices to return

        Returns
        -------
        kf_tuple : tuple
            Tuple of kernel functions from the selected KMs

        """

        indices = self._check_indices(indices)

        # materialized into a tuple, as documented: a bare generator
        # expression could be consumed only once, and has no length
        return tuple(self._km_set[index].kernel for index in indices)


    def _check_indices(self, indices):
        """
        Checks the validity and type of indices.

        Returns them as an int64 ndarray. A non-iterable input is treated as a
        single index. Raises IndexError if any of them is negative, or not
        smaller than the size of this set.
        """

        if not isinstance(indices, Iterable):
            indices = [indices, ]

        indices = np.array(indices, dtype='int64')

        if any(indices < 0) or any(indices >= self.size):
            raise IndexError(
                'One/more indices are out of range for KernelSet of size {}'
                ''.format(self.size))

        return indices


    def __str__(self):
        """Human readable repr"""

        return "{}({} kernels, {} samples):\n\t{} " \
               "".format(self.name, self.size, self.num_samples,
                         "\n\t".join(map(str, self._km_set)))


    def __format__(self, format_spec):
        """
        Formats the human readable repr according to format_spec.

        This can not be a plain alias of __str__: __format__ receives the
        format spec as an extra argument, so an alias would raise TypeError
        in format(), str.format() and f-strings.
        """

        return format(str(self), format_spec)


    # aliasing it to __str__ for now
    __repr__ = __str__


    def __iter__(self):
        """Making an iterable."""

        # size is read only once, at the start of the iteration: this
        # ensures extending a set with itself terminates
        for index in range(self.size):
            yield self._km_set[index]


    def attach_to(self, sample,
                  name='sample',
                  attr_name=None,
                  attr_value=None):
        """
        Attach all the kernel matrices in this set to a given sample.

        Any previous evaluations to other samples and their results will be reset.

        Parameters
        ----------
        sample : ndarray
            Input sample to operate on
            Must be 2D of shape (num_samples, num_features)

        name : str
            Identifier for the sample (esp. when multiple are in the same set)

        attr_name : str or hashable
            Name of an optional attribute to set on each kernel matrix
            in this set. Nothing is set when it is None.

        attr_value : object
            Value of that attribute, same for all the kernel matrices

        Raises
        ------
        ValueError
            If the number of samples of this set is already known,
            and the number of rows in sample differs from it

        """

        # validating first, and storing only when compatible.
        # NOTE(review): assumes ensure_ndarray_2D returns the input as an
        #   ndarray (so that .shape exists even for list inputs) -- confirm
        sample_2d = ensure_ndarray_2D(sample)
        if self._num_samples is not None and \
            sample_2d.shape[0] != self._num_samples:
            raise ValueError('Number of samples in input differ from this KernelSet')

        self.sample = sample_2d
        self._num_samples = sample_2d.shape[0]

        for km in self._km_set:
            km.attach_to(sample_2d, name_one=name)

        if attr_name is not None:
            for km in self._km_set:
                km.set_attr(attr_name, attr_value)


    def extend(self, another_km_set):
        """
        Extends the current set by adding in all elements from another set.

        Raises
        ------
        KMSetAdditionError
            If the input is not a KernelSet, or if its number of samples
            differs from that of this set
        """

        if not isinstance(another_km_set, KernelSet):
            raise KMSetAdditionError('Input is not a KernelSet!'
                                     'Build a KernelSet() first.')

        if another_km_set.num_samples != self.num_samples:
            raise KMSetAdditionError('The two KernelSets are not compatible'
                                     ', in size (# samples)')

        for km in another_km_set:
            self.append(km)


    def set_attr(self, name, values):
        """
        Sets user-defined attributes for the kernel matrices in this set.

        If values is a string or is not an iterable, the same value is set for
        all. Otherwise values must be of the same size as this KernelSet,
        providing a separate value for each element.

        Useful to identify this kernel matrix in various aspects!
        You could think of them as tags or identifiers etc.
        As they are user-defined, they are ideal to represent user needs and
        applications.

        Parameters
        ----------
        name : str or hashable
            Names of the attribute.

        values : object
            Value of the attribute

        Raises
        ------
        ValueError
            If values is a non-string iterable whose length differs from the
            size of this set

        """

        if not isinstance(values, Iterable) or isinstance(values, str):
            values = [values] * self.size
        elif len(values) != self.size:
            raise ValueError('Values must be single element, or '
                             'of the same size as this KernelSet ({}), '
                             'providing a separate value for each element.'
                             'It is {}'.format(self.size, len(values)))

        for index in range(self.size):
            self._km_set[index].set_attr(name, values[index])


    def get_attr(self, name, value_if_not_found=None):
        """Returns the values of an user-defined attribute.

        If not set previously, or no match found, value_if_not_found is used
        for that kernel matrix.

        Parameters
        ----------
        name : str or hashable
            Name of the attribute

        value_if_not_found : object
            If attribute was not set previously, returns this value

        Returns
        -------
        attr_values : list
            Values of the attribute, one from each KM in the set, as returned
            by get_attr(name, value_if_not_found) of that KM.
        """

        return [self._km_set[index].get_attr(name, value_if_not_found)
                for index in range(self.size)]


class CompositeKernel(ABC):
    """
    Class to combine a set of kernels into a composite kernel.

    Derived classes must implement fit(), which is expected to store the
    resulting matrix in self.KM, and to set self._is_fitted to True.

    Parameters
    -----------
    km_set : KernelSet
        KernelSet on which the composite kernel will be applied to

    name : str
        Identifier for the composite kernel

    """


    def __init__(self, km_set, name='Composite'):
        """
        Constructor.

        Raises
        ------
        TypeError
            If km_set is not a KernelSet

        ValueError
            If km_set has fewer than 2 kernels, or its num_samples is None
        """

        if not isinstance(km_set, KernelSet):
            raise TypeError('Input must be a KernelSet')

        if km_set.size < 2:
            raise ValueError('KernelSet must have atleast 2 kernels')

        if km_set.num_samples is None:
            raise ValueError('KernelSet is not attached to any sample!')

        self.km_set = km_set
        self.num_samples = km_set.num_samples
        self._is_fitted = False
        self.name = name


    @abstractmethod
    def fit(self):
        """Abstract methods that needs to be defined later."""
        pass


    @property
    def composite_KM(self):
        """
        Returns the result of composite operation

        Raises
        ------
        ValueError
            If the composite kernel is not fitted yet
        """

        if self._is_fitted:
            return self.KM
        else:
            raise ValueError('{} is not fitted yet!'.format(self.name))


    @property
    def full(self):
        """
        Returns the result of composite operation.

        Alias for composite_KM to match the KernelMatrix interface.
        """

        return self.composite_KM


    def __str__(self):
        """human readable presentation"""

        return "{}-->{}".format(self.name, str(self.km_set))


    def __format__(self, format_spec):
        """
        Formats the human readable presentation according to format_spec.

        This can not be a plain alias of __str__: __format__ receives the
        format spec as an extra argument, so an alias would raise TypeError
        in format(), str.format() and f-strings.
        """

        return format(str(self), format_spec)


    # aliasing it to __str__ for now
    __repr__ = __str__


class SumKernel(CompositeKernel):
    """Composite kernel computed as the weighted sum of the KMs in a KernelSet

    Parameters
    -----------
    km_set : KernelSet
        KernelSet from which the sum kernel will be computed

    name : str
        Identifier for the composite kernel
    """


    def __init__(self, km_set, name='SumKernel'):
        """Constructor: validation and setup are done by CompositeKernel."""

        super().__init__(km_set, name=name)


    def fit(self, kernel_weights=None):
        """
        Computes the sum kernel, and stores it in self.KM

        Parameters
        ----------
        kernel_weights : Iterable or None
            One weight per kernel matrix in the set.
            When None, a weight of 1 is used for each of them.

        Raises
        ------
        ValueError
            If the number of weights differs from the size of the kernel set
        """

        num_kernels = self.km_set.size

        if kernel_weights is not None:
            kernel_weights = ensure_ndarray_1D(kernel_weights)
            if kernel_weights.size != num_kernels:
                raise ValueError('Incompatible set of kernel_weights given.'
                                 'Must be an array of length exactly {}'
                                 ''.format(self.km_set.size))
        else:
            # unweighted sum by default
            kernel_weights = np.ones(num_kernels)

        km_shape = (self.num_samples, self.num_samples)
        self.KM = np.zeros(km_shape)
        for km_weight, member_km in zip(kernel_weights, self.km_set):
            self.KM = self.KM + km_weight * member_km.full

        self._is_fitted = True


class ProductKernel(CompositeKernel):
    """Composite kernel computed as the element-wise product of the KMs in a
    KernelSet

    Parameters
    -----------
    km_set : KernelSet
        KernelSet from which the product kernel will be computed

    name : str
        Identifier for the composite kernel

    """


    def __init__(self, km_set, name='ProductKernel'):
        """Constructor: validation and setup are done by CompositeKernel."""

        super().__init__(km_set, name=name)


    def fit(self):
        """Computes the product kernel, and stores it in self.KM"""

        km_shape = (self.num_samples, self.num_samples)

        # starting from all ones, the identity for multiplication
        self.KM = np.ones(km_shape)
        for member_km in self.km_set:
            # element-wise product, not a matrix product
            self.KM = self.KM * member_km.full

        self._is_fitted = True


class AverageKernel(CompositeKernel):
    """Composite kernel computed as the unweighted average of the KMs in a
    KernelSet

    Parameters
    -----------
    km_set : KernelSet
        KernelSet from which the average kernel will be computed

    name : str
        Identifier for the composite kernel
    """


    def __init__(self, km_set, name='AverageKernel'):
        """Constructor: validation and setup are done by CompositeKernel."""

        super().__init__(km_set, name=name)


    def fit(self):
        """Computes the average kernel, and stores it in self.KM"""

        km_shape = (self.num_samples, self.num_samples)

        # element-wise sum of all the kernel matrices in the set
        self.KM = np.zeros(km_shape)
        for member_km in self.km_set:
            self.KM = self.KM + member_km.full

        # sum divided by the number of kernel matrices gives the average
        self.KM = self.KM / self.km_set.size

        self._is_fitted = True


class WeightedAverageKernel(CompositeKernel):
    """Class to define and compute a weighted average kernel from a KernelSet

    Parameters
    -----------
    km_set : KernelSet
        KernelSet from which the average kernel will be computed

    weights : Iterable
        One weight per kernel matrix in km_set.

    name : str
        Identifier for the composite kernel

    """


    def __init__(self,
                 km_set,
                 weights,
                 name='WeightedAverageKernel'):
        """
        Constructor.

        Raises
        ------
        ValueError
            If the number of weights differs from the size of the kernel set
        """

        super().__init__(km_set, name=name)

        if self.km_set.size == len(weights):
            self.weights = ensure_ndarray_1D(weights)
        else:
            # arguments in the order of the placeholders:
            # number of weights first, then the size of the kernel set
            raise ValueError('Number of weights ({}) supplied differ from the '
                             'kernel set size ({})'
                             ''.format(len(weights), self.km_set.size))


    def fit(self):
        """
        Computes the weighted average kernel, and stores it in self.KM

        NOTE(review): the weights are applied as given, the result is not
        divided by their sum. It is an average in the strict sense only
        when the weights add up to 1 -- confirm this is intended.
        """

        self.KM = np.zeros((self.num_samples, self.num_samples))
        for weight, km in zip(self.weights, self.km_set):
            self.KM = self.KM + weight * km.full

        self._is_fitted = True
kernelmethods 0.2 documentation

Source code for kernelmethods.base