# Source code for base

# -*- coding: utf-8 -*-

# Free-text description of the package's purpose and terminology. It is bound
# to a module-level name by plain assignment, so it does not act as the module
# docstring; nothing in the visible part of this file reads it.
docopt_string = """
Conquering confounds and covariates in machine learning


Definition of confound from Rao et al., 2017:
"For a given data sample D, a confound is a variable that affects the image data
and whose sample association with the target variable is not representative of the
population-of-interest. The sample D is then said to be biased (by the confound),
with respect to the population-of-interest.

Note that if a variable affects the image data but its association with the target
variable is representative of the population-of-interest, we would then consider
the sample to be unbiased, and the variable is not a true confound."


Other definitions used:

samplet: one row referring to single subject in sample feature matrix X (size Nxp )

"""

from abc import ABC

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.utils.validation import (check_array, check_consistent_length,
                                      check_is_fitted)

from confounds.utils import get_model


class ConfoundsException(Exception):
    """Custom exception to indicate confounds-library specific issues.

    Derives from ``Exception`` rather than ``BaseException`` so that generic
    ``except Exception`` handlers catch it; ``BaseException`` is reserved for
    interpreter-level signals such as ``KeyboardInterrupt`` and ``SystemExit``.
    Handlers written as ``except ConfoundsException`` or
    ``except BaseException`` keep working unchanged.
    """


class BaseDeconfound(BaseEstimator, TransformerMixin, ABC):
    """Common parent of the deconfounding / covariate-adjustment estimators.

    Combines scikit-learn's ``BaseEstimator`` and ``TransformerMixin`` with
    ``ABC``. The ``fit`` and ``transform`` methods defined here have empty
    bodies (they return ``None``); the concrete estimators in this module
    override both.
    """

    _estimator_type = "deconfounder"

    def __init__(self, name='Deconfounder'):
        """Store ``name``, a label identifying the deconfounding method."""
        self.name = name

    def fit(self, X, y):
        """Placeholder fit; does nothing.

        Argument names follow scikit-learn conventions where possible, but
        ``y`` holds the confound variables here, not the target.
        """

    def transform(self, X, y):
        """Placeholder transform; does nothing.

        Argument names follow scikit-learn conventions where possible, but
        ``y`` holds the confound variables here, not the target.
        """


class Augment(BaseDeconfound):
    """
    Deconfounding estimator class that simply augments/concatenates the
    confounding variables to input features prior to prediction.
    """


    def __init__(self):
        """Constructor"""

        super().__init__(name='Augment')

        # this class has no parameters


    def fit(self,
            X,  # variable names chosen to correspond to sklearn when possible
            y=None,  # y is the confound variables here, not the target!
            ):
        """
        Validates the inputs and records the number of input features.

        Variable names X, y had to be used to pass sklearn conventions. y here
        refers to the confound variables, and NOT the target. See examples in docs!

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples.
        y : ndarray
            Array of covariates, shape (n_samples, n_covariates)
            This does not refer to target as is typical in scikit-learn.

        Returns
        -------
        self : object
            Returns self

        Raises
        ------
        ValueError
            If X and y do not have the same number of rows.
        """

        return self._fit(X, y)  # which itself must return self


    def _fit(self, in_features, confounds=None):
        """Actual fit method: validates inputs and stores ``n_features_``."""

        in_features = check_array(in_features)
        confounds = check_array(confounds, ensure_2d=False)

        # turning it into 2D, in case if its just a column
        if confounds.ndim == 1:
            confounds = confounds[:, np.newaxis]

        # Catch only the errors the length check can raise: a bare ``except``
        # would also intercept KeyboardInterrupt/SystemExit. The original
        # error is chained so the underlying cause stays visible.
        try:
            check_consistent_length(in_features, confounds)
        except (TypeError, ValueError) as exc:
            raise ValueError('X (features) and y (confounds) must have the same '
                             'number rows/samplets!') from exc

        # remembered so that transform() can reject inputs of a different width
        self.n_features_ = in_features.shape[1]

        return self


    def transform(self, X, y=None):
        """
        Transforms the given feature set by augmenting the confounding variables.

        Variable names X, y had to be used to pass sklearn conventions. y here
        refers to the confound variables for the [test] to be transformed, and NOT
        their target values. See examples in docs!

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input samples to transform.
        y : ndarray, optional
            Array of covariates, shape (n_samples, n_covariates)
            This does not refer to target as is typical in scikit-learn.

        Returns
        -------
        X_augmented : array
            ``X`` with the columns of ``y`` appended on the right. When ``y``
            is None, the validated ``X`` is returned without any augmentation.

        Raises
        ------
        ValueError
            If the number of columns in X differs from the one seen in fit.
        """

        return self._transform(X, y)


    def _transform(self, test_features, test_confounds):
        """Actual deconfounding of the test features"""

        check_is_fitted(self, 'n_features_')
        test_features = check_array(test_features, accept_sparse=True)

        if test_features.shape[1] != self.n_features_:
            raise ValueError('number of features must be {}. Given {}'
                             ''.format(self.n_features_, test_features.shape[1]))

        if test_confounds is None:  # during estimator checks
            return test_features  # do nothing

        test_confounds = check_array(test_confounds, ensure_2d=False)
        check_consistent_length(test_features, test_confounds)

        # column_stack treats a 1-D confound vector as a single extra column
        return np.column_stack((test_features, test_confounds))


class Residualize(BaseDeconfound):
    """
    Deconfounding estimator class that residualizes the input features by
    subtracting the contributions from the confound variables

    Example methods: Linear, Kernel Ridge, Gaussian Process Regression etc
    """


    def __init__(self, model='linear'):
        """Constructor

        Parameters
        ----------
        model : optional, default 'linear'
            Specification of the regression model, passed as-is to
            ``get_model`` when fitting.
        """

        super().__init__(name='Residualize')

        self.model = model


    def fit(self,
            X,  # variable names chosen to correspond to sklearn when possible
            y=None,  # y is the confound variables here, not the target!
            ):
        """
        Fits the residualizing model (estimates the contributions of confounding
        variables (y) to the given [training] feature set X.  Variable names X,
        y had to be used to pass sklearn conventions. y here refers to the
        confound variables, and NOT the target. See examples in docs!

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples.
        y : ndarray
            Array of covariates, shape (n_samples, n_covariates)
            This does not refer to target as is typical in scikit-learn.

        Returns
        -------
        self : object
            Returns self

        Raises
        ------
        ValueError
            If X and y do not have the same number of rows.
        """

        return self._fit(X, y)  # which itself must return self


    def _fit(self, in_features, confounds=None):
        """Actual fit method: regresses the features on the confounds."""

        in_features = check_array(in_features)
        confounds = check_array(confounds, ensure_2d=False)

        # turning it into 2D, in case if its just a column
        if confounds.ndim == 1:
            confounds = confounds[:, np.newaxis]

        # Catch only the errors the length check can raise: a bare ``except``
        # would also intercept KeyboardInterrupt/SystemExit. The original
        # error is chained so the underlying cause stays visible.
        try:
            check_consistent_length(in_features, confounds)
        except (TypeError, ValueError) as exc:
            raise ValueError('X (features) and y (confounds) '
                             'must have the same number of rows/samplets!') from exc

        # remembered so that transform() can reject inputs of a different width
        self.n_features_ = in_features.shape[1]

        # confounds are the predictors, features are the (multi-output) response
        regr_model = clone(get_model(self.model))
        regr_model.fit(confounds, in_features)
        self.model_ = regr_model

        return self


    def transform(self, X, y=None):
        """
        Transforms the given feature set by residualizing the [test] features
        by subtracting the contributions of their confounding variables.

        Variable names X, y had to be used to pass scikit-learn conventions. y here
        refers to the confound variables for the [test] to be transformed,
        and NOT their target values. See examples in docs!

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input samples to transform.
        y : ndarray, optional
            Array of covariates, shape (n_samples, n_covariates)
            This does not refer to target as is typical in scikit-learn.

        Returns
        -------
        residuals : array
            ``X`` minus the fitted model's prediction of ``X`` from ``y``.
            When ``y`` is None, the validated ``X`` is returned unchanged.

        Raises
        ------
        ValueError
            If the number of columns in X differs from the one seen in fit.
        """

        return self._transform(X, y)


    def _transform(self, test_features, test_confounds):
        """Actual deconfounding of the test features"""

        # Both attribute names go in ONE sequence: the argument following
        # ``attributes`` in check_is_fitted is the error message, not a second
        # attribute name.
        check_is_fitted(self, ('model_', 'n_features_'))
        test_features = check_array(test_features, accept_sparse=True)

        if test_features.shape[1] != self.n_features_:
            raise ValueError('number of features must be {}. Given {}'
                             ''.format(self.n_features_, test_features.shape[1]))

        if test_confounds is None:  # during estimator checks
            return test_features  # do nothing

        test_confounds = check_array(test_confounds, ensure_2d=False)

        # mirror _fit: a single confound given as a 1-D vector becomes a column,
        # so the model sees the same 2-D layout it was fitted on
        if test_confounds.ndim == 1:
            test_confounds = test_confounds[:, np.newaxis]

        check_consistent_length(test_features, test_confounds)

        # test features as can be explained/predicted by their covariates
        test_feat_predicted = self.model_.predict(test_confounds)
        residuals = test_features - test_feat_predicted

        return residuals


class ResidualizeTarget(BaseDeconfound):
    """
    Placeholder deconfounding estimator that is not implemented yet:
    constructing it always raises ``NotImplementedError``.

    NOTE(review): judging by the class name, this is presumably meant to
    residualize the target rather than the input features -- confirm.
    """

    def __init__(self):
        """Initialise the base class, then signal that the method is missing."""
        super().__init__(name='ResidualizeTarget')
        raise NotImplementedError


class DummyDeconfounding(BaseDeconfound):
    """
    A do-nothing dummy method, to serve as a reference for methodological comparisons
    """


    def __init__(self):
        """Constructor"""

        super().__init__(name='DummyPassThrough')


    def fit(self, X, y=None):
        """
        A do-nothing fit method: only validates X and records its width.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : ndarray, optional
            Array of covariates, shape (n_samples, n_covariates).
            Accepted for interface compatibility; it is not used.

        Returns
        -------
        self : object
            Returns self.
        """

        X = check_array(X, accept_sparse=True)

        # remembered so that transform() can reject inputs of a different width
        self.n_features_ = X.shape[1]

        return self


    def transform(self, X, y=None):
        """
        A do-nothing transform method.

        Parameters
        ----------
        X : {array-like, sparse-matrix}, shape (n_samples, n_features)
            The input samples.

        y : ndarray, optional
            Array of covariates, shape (n_samples, n_covariates).
            Accepted for interface compatibility; it is not used.

        Returns
        -------
        X_transformed : array, shape (n_samples, n_features)
            Same as the input ``X`` (after validation).

        Raises
        ------
        ValueError
            If the number of columns in X differs from the one seen in fit.
        """

        check_is_fitted(self, 'n_features_')
        X = check_array(X, accept_sparse=True)

        if X.shape[1] != self.n_features_:
            raise ValueError('num_features differ between fit and transform!')

        return X  # dummy pass-through, doing nothing except for shape checks.
# confounds 0.1.1 documentation
#
# Source code for base