Source code for kernelmethods.sampling

from functools import partial
from warnings import warn

import numpy as np
from kernelmethods import config as cfg
from kernelmethods.base import BaseKernelFunction, KernelMatrix, KernelSet
from kernelmethods.config import KernelMethodsException, KernelMethodsWarning
from kernelmethods.numeric_kernels import (GaussianKernel, LaplacianKernel,
                                           LinearKernel, PolyKernel)
from kernelmethods.operations import alignment_centered
from kernelmethods.utils import is_iterable_but_not_str
from scipy.stats.stats import pearsonr


class KernelBucket(KernelSet):
    """
    Class to generate and/or maintain a "bucket" of candidate kernels.

    Applications:

        1. to rank/filter/select kernels based on a given sample via many metrics
        2. to be defined.

    **Note**:

        1. Linear kernel is always added during init without your choosing.
        2. This is in contrast to Chi^2 kernel, which is not added to the bucket by
           default, as it requires positive feature values and may break default use
           for common applications. You can easily add Chi^2 or any other kernels
           via the ``add_parametrized_kernels`` method.

    Parameters
    ----------
    poly_degree_values : Iterable
        List of values for the degree parameter of the PolyKernel. One
        KernelMatrix will be added to the bucket for each value.

    rbf_sigma_values : Iterable
        List of values for the sigma parameter of the GaussianKernel. One
        KernelMatrix will be added to the bucket for each value.

    laplace_gamma_values : Iterable
        List of values for the gamma parameter of the LaplacianKernel. One
        KernelMatrix will be added to the bucket for each value.

    name : str
        String to identify the purpose or type of the bucket of kernels.
        Also helps easily distinguishing it from other buckets.

    normalize_kernels : bool
        Flag to indicate whether the kernel matrices need to be normalized

    skip_input_checks : bool
        Flag to indicate whether checks on input data (type, format etc) can
        be skipped. This helps save a tiny bit of runtime for expert uses when
        data types and formats are managed thoroughly in numpy. Default:
        False. Disable this only when you know exactly what you're doing!

    """

    def __init__(self,
                 poly_degree_values=cfg.default_degree_values_poly_kernel,
                 rbf_sigma_values=cfg.default_sigma_values_gaussian_kernel,
                 laplace_gamma_values=cfg.default_gamma_values_laplacian_kernel,
                 name='KernelBucket',
                 normalize_kernels=True,
                 skip_input_checks=False,
                 ):
        """
        Constructor.

        Starts the bucket with a linear kernel, then adds one kernel matrix
        per value in each of the three parameter lists. Passing ``None`` for
        a list skips that kernel family. The parameters are described in the
        class docstring.

        Raises
        ------
        TypeError
            If ``normalize_kernels`` or ``skip_input_checks`` is not a bool.

        """

        if isinstance(normalize_kernels, bool):
            self._norm_kernels = normalize_kernels
        else:
            raise TypeError('normalize_kernels must be bool')

        if isinstance(skip_input_checks, bool):
            self._skip_input_checks = skip_input_checks
        else:
            raise TypeError('skip_input_checks must be bool')

        # start with the addition of kernel matrix for linear kernel
        init_kset = [KernelMatrix(LinearKernel(), normalized=self._norm_kernels), ]
        super().__init__(km_list=init_kset, name=name)
        # not attached to a sample yet
        self._num_samples = None

        self.add_parametrized_kernels(PolyKernel, 'degree', poly_degree_values)
        self.add_parametrized_kernels(GaussianKernel, 'sigma', rbf_sigma_values)
        self.add_parametrized_kernels(LaplacianKernel, 'gamma', laplace_gamma_values)

    def add_parametrized_kernels(self, kernel_func, param, values):
        """
        Adds a list of kernels parametrized by various values for a given param

        A value for which the kernel cannot be built or appended is skipped
        with a ``KernelMethodsWarning``; the remaining values are still
        processed.

        Parameters
        ----------
        kernel_func : BaseKernelFunction
            Kernel function to be added (not an instance, but callable class)

        param : str
            Name of the parameter to the above kernel function

        values : Iterable
            List of parameter values. One kernel will be added for each value.
            If None, nothing is added.

        Raises
        ------
        KernelMethodsException
            If ``kernel_func`` is not a class derived from BaseKernelFunction.

        ValueError
            If ``values`` is rejected by ``is_iterable_but_not_str`` with
            ``min_length=1``.

        """

        if not (isinstance(kernel_func, type)
                and issubclass(kernel_func, BaseKernelFunction)):
            raise KernelMethodsException('Input {} is not a valid kernel func!'
                                         ' Must be derived from BaseKernelFunction'
                                         ''.format(kernel_func))

        # no values given for this kernel family: silently do nothing
        if values is None:
            return

        if not is_iterable_but_not_str(values, min_length=1):
            raise ValueError('values must be an iterable set of param values (n>=1)')

        for val in values:
            try:
                param_dict = {param: val,
                              'skip_input_checks': self._skip_input_checks}
                self.append(KernelMatrix(kernel_func(**param_dict),
                                         normalized=self._norm_kernels))
            except Exception:
                # Best-effort by design: one bad value must not abort the whole
                # bucket. Catching Exception (not a bare except) lets
                # KeyboardInterrupt and SystemExit propagate.
                warn('Unable to add {} to the bucket for {}={}. Skipping it.'
                     ''.format(kernel_func, param, val), KernelMethodsWarning)
def make_kernel_bucket(strategy='exhaustive',
                       normalize_kernels=True,
                       skip_input_checks=False):
    """
    Builds a bucket of candidate kernels following the requested strategy.

    Parameters
    ----------
    strategy : str
        Name of the strategy for populating the kernel bucket, matched
        case-insensitively. Options: 'exhaustive' and 'light'.
        Default: 'exhaustive'. If a ``KernelBucket`` or ``KernelSet``
        instance is passed instead, it is returned unchanged after a warning.

    normalize_kernels : bool
        Flag to indicate whether to normalize the kernel matrices

    skip_input_checks : bool
        Flag to indicate whether checks on input data (type, format etc) can
        be skipped. This helps save a tiny bit of runtime for expert uses when
        data types and formats are managed thoroughly in numpy. Default:
        False. Disable this only when you know exactly what you're doing!

    Returns
    -------
    kb : KernelBucket
        Kernel bucket populated according to the requested strategy

    Raises
    ------
    ValueError
        If ``strategy`` is a name other than the options listed above.

    """

    # an already-built container is handed back as is
    if isinstance(strategy, (KernelBucket, KernelSet)):
        warn('Input is already a kernel bucket/set - simply returning it!')
        return strategy

    choice = strategy.lower()
    shared_opts = {'normalize_kernels': normalize_kernels,
                   'skip_input_checks': skip_input_checks}

    if choice == 'exhaustive':
        return KernelBucket(
            poly_degree_values=cfg.default_degree_values_poly_kernel,
            rbf_sigma_values=cfg.default_sigma_values_gaussian_kernel,
            laplace_gamma_values=cfg.default_gamma_values_laplacian_kernel,
            name='KBucketExhaustive',
            **shared_opts)

    if choice == 'light':
        return KernelBucket(
            poly_degree_values=cfg.light_degree_values_poly_kernel,
            rbf_sigma_values=cfg.light_sigma_values_gaussian_kernel,
            laplace_gamma_values=cfg.light_gamma_values_laplacian_kernel,
            name='KBucketLight',
            **shared_opts)

    raise ValueError('Invalid choice of strategy '
                     '- must be one of {}'.format(cfg.kernel_bucket_strategies))
def ideal_kernel(targets):
    """
    Computes the kernel matrix from the given target labels.

    Parameters
    ----------
    targets : Iterable
        Target values (``y``) to compute the ideal kernel from.
        They are flattened into a single column before use.

    Returns
    -------
    ideal_kernel : ndarray
        The ideal kernel, i.e. the outer product ``y y^T`` of the targets,
        of shape (n, n) where n is the total number of target values.

    """

    # reshape((-1, 1)) gives an (n, 1) column vector, so column . column^T
    # is the (n, n) outer product
    column = np.array(targets).reshape((-1, 1))
    return column.dot(column.T)
def correlation_km(k1, k2):
    """
    Pearson correlation coefficient between two kernel matrices, each
    flattened into a vector before comparison.

    Parameters
    ----------
    k1, k2 : ndarray
        Two kernel matrices of the same size

    Returns
    -------
    corr_coef : float
        Correlation coefficient between the vectorized kernel matrices

    """

    flat_one = k1.ravel()
    flat_two = k2.ravel()

    # pearsonr yields (coefficient, p-value); only the coefficient is needed
    return pearsonr(flat_one, flat_two)[0]
def pairwise_similarity(k_bucket, metric='corr'):
    """
    Computes the similarity between all pairs of kernel matrices in a given
    bucket.

    Parameters
    ----------
    k_bucket : KernelBucket
        Container of length num_km, with each an instance ``KernelMatrix``

    metric : str
        Identifies the metric to be used. Options: ``corr`` (correlation
        coefficient) and ``align`` (centered alignment).

    Returns
    -------
    pairwise_metric : ndarray of shape (num_km, num_km)
        A symmetric matrix computing the pairwise similarity between the
        various kernel matrices

    Raises
    ------
    KeyError
        If ``metric`` is not one of the options listed above.

    """

    similarity_funcs = {
        'corr' : correlation_km,
        'align': partial(alignment_centered, value_if_zero_division=0.0),
    }

    count = k_bucket.size
    compare = similarity_funcs[metric]

    similarity = np.full((count, count), fill_value=np.nan)

    # The result is symmetric, so each pair is evaluated once (row <= col)
    # and the value is mirrored into the lower triangle. Diagonal entries
    # (row == col) are computed too, for consistency.
    for row in range(count):
        for col in range(row, count):
            value = compare(k_bucket[row].full, k_bucket[col].full)
            similarity[row, col] = value
            similarity[col, row] = value

    return similarity