Source code for kernelmethods.categorical

"""

Module for categorical kernels

Please refer to the following papers and theses for more details:

 - Villegas García, Marco Antonio. "An investigation into new kernels for
   categorical variables." Master's thesis, Universitat Politècnica de Catalunya,
   2013.


"""

import numpy as np

from kernelmethods.base import BaseKernelFunction
from kernelmethods.utils import check_input_arrays
from kernelmethods import config as cfg


[docs]class MatchCountKernel(BaseKernelFunction): """ Categorical kernel measuring similarity via the number of matching categorical dimensions. Parameters ---------- return_perc : bool If True, the return value would be normalized by the number of dimensions. References ---------- Villegas García, Marco A., "An investigation into new kernels for categorical variables." Master's thesis, Universitat Politècnica de Catalunya, 2013. """ def __init__(self, return_perc=True, skip_input_checks=False): """Constructor.""" self.return_perc = return_perc if self.return_perc: super().__init__('MatchPerc') else: super().__init__('MatchCount') self.skip_input_checks = skip_input_checks def __call__(self, vec_c, vec_d): """ Actual implementation of the kernel func. Parameters ---------- vec_c, vec_d : array of equal-sized categorical variables """ vec_c, vec_d = _check_categorical_arrays(vec_c, vec_d) if not np.issubdtype(vec_c.dtype, cfg.dtype_categorical) or \ not np.issubdtype(vec_d.dtype, cfg.dtype_categorical): raise TypeError('Categorical kernels require str or unicode dtype') match_count = np.sum(vec_c==vec_d) if self.return_perc: return match_count / len(vec_d) else: return match_count def __str__(self): """human readable repr""" return self.name
def _check_categorical_arrays(x, y): """ Ensures the inputs are 1) 1D arrays (not matrices) 2) with compatible size 3) of categorical data type and hence are safe to operate on. This is a variation of utils.check_input_arrays() to accommodate the special needs for categorical dtype, where we do not have lists of originally numbers/bool data to be converted to strings, and assume they are categorical. Parameters ---------- x : iterable y : iterable Returns ------- x : ndarray y : ndarray """ x = _ensure_type_size(x, ensure_num_dim=1) y = _ensure_type_size(y, ensure_num_dim=1) if x.size != y.size: raise ValueError('x (n={}) and y (n={}) differ in size! ' 'They must be of same length'.format(x.size, y.size)) return x, y def _ensure_type_size(array, ensure_num_dim=1): """Checking type and size of arrays""" if not isinstance(array, np.ndarray): array = np.squeeze(np.asarray(array)) if array.ndim != ensure_num_dim: raise ValueError('array must be {}-dimensional! ' 'It has {} dims with shape {} ' ''.format(ensure_num_dim, array.ndim, array.shape)) return array