Source code for rhst

from __future__ import print_function

from neuropredict.classify import check_positive_class

__all__ = ['run', 'load_results', 'save_results']

import os
import sys
import pickle
import logging
import warnings
from warnings import catch_warnings, filterwarnings, simplefilter
from collections import Counter, namedtuple
from sys import version_info
from os.path import join as pjoin, exists as pexists
from multiprocessing import Pool, Manager
from functools import partial
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV, ShuffleSplit
import traceback
import shutil

if version_info.major > 2:
    from neuropredict import config_neuropredict as cfg
    from neuropredict.algorithms import get_pipeline, get_feature_importance
    from neuropredict.reports import report_best_params, export_results
    from neuropredict.utils import (check_feature_sets_are_comparable,
                                    check_params_rhst, balanced_accuracy,
                                    load_options, sub_group_identifier,
                                    make_numeric_labels, impute_missing_data)
    from neuropredict.io import load_pyradigms
else:
    raise NotImplementedError('neuropredict requires Python 3+.')


def eval_optimized_model_on_testset(train_fs, test_fs,
                                    impute_strategy=cfg.default_imputation_strategy,
                                    label_order_in_conf_matrix=None,
                                    feat_sel_size=cfg.default_num_features_to_select,
                                    train_perc=0.5,
                                    grid_search_level=cfg.GRIDSEARCH_LEVEL_DEFAULT,
                                    classifier_name=cfg.default_classifier,
                                    feat_select_method=cfg.default_feat_select_method):
    """
    Optimize the classifier on the training set and return predictions on test set.

    Parameters
    ----------
    train_fs : MLDataset
        Dataset to optimize a given classifier on.

    test_fs : MLDataset
        Dataset to make predictions on using the classifier optimized on training set.

    impute_strategy : str
        Strategy to handle the missing data: whether to raise an error if data is missing, or
            to impute them using the method chosen here.

    label_order_in_conf_matrix : list
        List of labels to compute the order of confusion matrix.

    feat_sel_size : str or int
        Metho to choose the number of featurese to select.

    train_perc : float
        Training set fraction to run the inner cross-validation.

    grid_search_level : str
        If 'light', grid search resolution will be reduced to speed up optimization.
        If 'exhaustive', most values for most parameters will be user for optimization.

    classifier_name : str
        String identifying a scikit-learn classifier.

    feat_select_method : str
        String identifying a valid scikit-learn feature selection method.

    Returns
    -------

    """

    if label_order_in_conf_matrix is None:
        raise ValueError('Label order for confusion matrix must be specified '
                         'for accurate results/visulizations.')

    train_data_mat, train_labels, _ = train_fs.data_and_targets()
    test_data_mat, true_test_labels, test_sample_ids = test_fs.data_and_targets()

    if impute_strategy is not None:
        train_data_mat, test_data_mat = impute_missing_data(train_data_mat, train_labels,
                                                            impute_strategy, test_data_mat)

    train_class_sizes = list(train_fs.target_sizes.values())

    # TODO look for ways to avoid building this every iter and every dataset.
    pipeline, param_grid = get_pipeline(train_class_sizes,
                                        feat_sel_size,
                                        train_fs.num_features,
                                        gs_level=grid_search_level,
                                        clfr_name=classifier_name,
                                        fsr_name=feat_select_method)

    best_pipeline, best_params = optimize_pipeline_via_grid_search_CV(pipeline,
                                                                      train_data_mat,
                                                                      train_labels,
                                                                      param_grid,
                                                                      train_perc)
    # best_model, best_params = optimize_RF_via_training_oob_score(train_data_mat,
    #     train_labels,
    #     param_grid['random_forest_clf__min_samples_leaf'],
    #     param_grid['random_forest_clf__max_features'])

    # assuming order in pipeline construction :
    #   - step 0 : preprocessign (robust scaling)
    #   - step 1 : feature selector / dim reducer
    _, best_fsr = best_pipeline.steps[1]
    _, best_clf = best_pipeline.steps[-1]  # the final step in an sklearn pipeline
                                           #   is always an estimator/classifier

    # making predictions on the test set and assessing their performance
    pred_test_labels = best_pipeline.predict(test_data_mat)

    # only the selected features get non-nan value
    feat_importance = get_feature_importance(classifier_name, best_clf,
                                             best_fsr, train_fs.num_features)

    # TODO test if the gathering of prob data is consistent
    #   across multiple calls to this method
    #   perhaps by controlling the class order in input
    # Order of the classes corresponds to that in the attribute best_model.classes_
    if hasattr(best_pipeline, 'predict_proba'):
        pred_prob = best_pipeline.predict_proba(test_data_mat)
    else:
        pred_prob = None

    conf_mat = confusion_matrix(true_test_labels, pred_test_labels,
                                labels=label_order_in_conf_matrix)

    misclsfd_samples = test_sample_ids[true_test_labels != pred_test_labels]

    return pred_prob, pred_test_labels, true_test_labels, \
           conf_mat, misclsfd_samples, \
           feat_importance, best_params


def optimize_RF_via_training_oob_score(train_data_mat, train_labels,
                                       range_min_leafsize, range_num_predictors):
    """
    Finds the best parameters just based on out of bag error
    within the training set (supposed to reflect test error).
    """

    oob_error_train = np.full([len(range_min_leafsize), len(range_num_predictors)],
                              np.nan)

    for idx_ls, minls in enumerate(range_min_leafsize):
        for idx_np, num_pred in enumerate(range_num_predictors):
            rf = RandomForestClassifier(max_features=num_pred, min_samples_leaf=minls,
                                        n_estimators=cfg.NUM_TREES, max_depth=None,
                                        oob_score=True)  # , random_state=SEED_RANDOM)
            rf.fit(train_data_mat, train_labels)
            oob_error_train[idx_ls, idx_np] = rf.oob_score_

    # identifying the best parameters
    best_idx_ls, best_idx_numpred = np.unravel_index(oob_error_train.argmin(),
                                                     oob_error_train.shape)
    best_minleafsize = range_min_leafsize[best_idx_ls]
    best_num_predictors = range_num_predictors[best_idx_numpred]
    best_params = {'min_samples_leaf': best_minleafsize,
                   'max_features'    : best_num_predictors}

    # training the RF using the best parameters
    best_rf = RandomForestClassifier(max_features=best_num_predictors,
                                     min_samples_leaf=best_minleafsize,
                                     oob_score=True,
                                     n_estimators=cfg.NUM_TREES)  #, random_state=SEED_RANDOM)
    best_rf.fit(train_data_mat, train_labels)

    return best_rf, best_params


def optimize_pipeline_via_grid_search_CV(pipeline, train_data_mat, train_labels,
                                         param_grid, train_perc):
    """Performs GridSearchCV and returns the best parameters
    and refitted Pipeline on full dataset with the best parameters."""

    # TODO perhaps k-fold is a better inner CV,
    #   which guarantees full use of training set with fewer repeats?
    inner_cv = ShuffleSplit(n_splits=cfg.INNER_CV_NUM_SPLITS,
                            train_size=train_perc,
                            test_size=1.0 - train_perc)
    # inner_cv = RepeatedKFold(n_splits=cfg.INNER_CV_NUM_FOLDS,
    #   n_repeats=cfg.INNER_CV_NUM_REPEATS)

    # gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=inner_cv,
    #                   n_jobs=cfg.GRIDSEARCH_NUM_JOBS,
    #                   pre_dispatch=cfg.GRIDSEARCH_PRE_DISPATCH)

    # not specifying n_jobs to avoid any kind of parallelism (joblib) from within
    # sklearn to avoid potentially bad interactions with outer parallelization
    # with builtin multiprocessing library
    gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=inner_cv,
                      refit=cfg.refit_best_model_on_ALL_training_set)

    # ignoring some not-so-critical warnings
    with catch_warnings():
        filterwarnings(action='once', category=UserWarning, module='joblib',
                       message='Multiprocessing-backed parallel loops cannot be '
                               'nested, setting n_jobs=1')
        filterwarnings(action='once', category=UserWarning,
                       message='Some inputs do not have OOB scores')
        np.seterr(divide='ignore', invalid='ignore')
        filterwarnings(action='once', category=RuntimeWarning,
                       message='invalid value encountered in true_divide')
        simplefilter(action='once', category=DeprecationWarning)

        gs.fit(train_data_mat, train_labels)

    return gs.best_estimator_, gs.best_params_


[docs]def save_results(out_dir, dict_of_objects_to_save):
    "Serializes the results to disk."

    # LATER choose a more universal serialization method
    #   (that could be loaded from a web app)
    try:
        out_results_path = pjoin(out_dir, cfg.file_name_results)
        with open(out_results_path, 'wb') as resfid:
            pickle.dump(dict_of_objects_to_save, resfid)
    except:
        raise IOError('Error saving the results to disk!')
    else:
        # deleting temp results only when saving full results is successful
        cleanup(out_dir)

    return out_results_path


def load_results_from_folder(results_folder):
    """

    Given a base output folder, possibly containing results for multiple sub-groups,
        returns a dictionary of results, keyed in by sub group identifier.

    """

    results = dict()
    options = load_options(results_folder)
    for ix, sg in enumerate(options['sub_groups']):
        sg_id = sub_group_identifier(sg, ix)
        results_file_path = pjoin(results_folder, sg_id, cfg.file_name_results)
        if not pexists(results_file_path) or os.path.getsize(results_file_path) <= 0:
            raise IOError('Results file for sub group {} does not exist'
                          ' or is empty!'.format(sg_id))
        results[sg_id] = load_results_dict(results_file_path)

    return results


def load_results_dict(results_file_path):
    "Loads the results serialized by RHsT."
    # TODO need to standardize what needs to saved/read back

    if not pexists(results_file_path) or os.path.getsize(results_file_path) <= 0:
        raise IOError("Results file to be loaded doesn't exist, or empty!")

    try:
        with open(results_file_path, 'rb') as rf:
            results_dict = pickle.load(rf)
    except:
        raise IOError('Error loading the saved results from \n{}'
                      ''.format(results_file_path))

    return results_dict


[docs]def load_results(results_file_path):
    "Loads the results serialized by RHsT."
    # TODO need to standardize what needs to saved/read back

    if not pexists(results_file_path):
        raise IOError("Results file to be loaded doesn't exist!")

    try:
        with open(results_file_path, 'rb') as rf:
            results_dict = pickle.load(rf)
            # # below is possible, but not explicit and a bad practice
            # # importing the keys and their values into the workspace
            # locals().update(results_dict)

            dataset_paths, method_names, train_perc, num_repetitions, num_classes, \
            pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep, \
            best_params, feature_importances_rf, \
            feature_names, num_times_misclfd, num_times_tested, \
            confusion_matrix, class_set, target_sizes, accuracy_balanced, \
            auc_weighted, positive_class, classifier_name, feat_select_method = \
                [results_dict.get(var_name)
                 for var_name in cfg.rhst_data_variables_to_persist]

    except:
        raise IOError('Error loading the saved results from \n{}'
                      ''.format(results_file_path))

    # TODO need a consolidated way to identify the variables saved their order
    return dataset_paths, method_names, train_perc, num_repetitions, num_classes, \
           pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep, \
           best_params, feature_importances_rf, feature_names, \
           num_times_misclfd, num_times_tested, \
           confusion_matrix, class_set, target_sizes, \
           accuracy_balanced, auc_weighted, positive_class, \
           classifier_name, feat_select_method


[docs]def run(dataset_path_file, method_names, out_results_dir,
        train_perc=0.8, num_repetitions=200,
        positive_class=None, sub_group=None,
        feat_sel_size=cfg.default_num_features_to_select,
        impute_strategy=cfg.default_imputation_strategy,
        missing_flag=None,
        num_procs=4,
        grid_search_level=cfg.GRIDSEARCH_LEVEL_DEFAULT,
        classifier_name=cfg.default_classifier,
        feat_select_method=cfg.default_feat_select_method,
        options_path=None):
    """

    Parameters
    ----------
    dataset_path_file : str
        path to file containing list of paths (each containing a valid MLDataset).

    method_names : list
        A list of names to denote the different feature extraction methods

    out_results_dir : str
        Path to output directory to save the cross validation results to.

    train_perc : float or numpy.float, optional
        Percetange of subjects to train the classifier on.
        The percentage is applied to the size of the smallest class to estimate
        the number of subjects from each class to be reserved for training.
        The smallest class is chosen to avoid class-imbalance in the training set.

        Default: 0.8 (80%).
    num_repetitions : int or numpy.int, optional
        Number of repetitions of cross-validation estimation. Default: 200.

    positive_class : str
        Name of the class to be treated as positive in calculation of AUC

    feat_sel_size : str or int
        Number of features to retain after feature selection.
        Must be a method (tenth or square root of the size of smallest class in training set,
            or a finite integer smaller than the data dimensionality.

    sub_group : list
        List of classes to focus on for classification. Default: all classes available.

    num_procs : int
        Number of parallel processes to run to parallelize the repetitions of CV

    grid_search_level : str
        If 'none', no grid search will be performed, choosing parameters based on 'folk wisdom'.
        If 'light', grid search resolution will be reduced to speed up optimization.
        If 'exhaustive', most values for most parameters will be user for optimization.

    options_path : str
        Path to a pickle file which contains the all user chosen options.

    Returns
    -------
    results_path : str
        Path to pickle file containing full set of CV results.

    """

    dataset_paths, num_repetitions, num_procs, sub_group = \
        check_params_rhst(dataset_path_file, out_results_dir, num_repetitions,
                          train_perc, sub_group, num_procs, grid_search_level,
                          classifier_name, feat_select_method)

    # loading datasets
    datasets = load_pyradigms(dataset_paths, sub_group)

    # making sure different feature sets are comparable
    common_ds, class_set, target_sizes, \
    num_samples, num_classes, num_datasets, num_features = \
        check_feature_sets_are_comparable(datasets)
    # TODO warning when num_rep are not suficient: need a heuristic to assess it

    positive_class, pos_class_index = check_positive_class(class_set, positive_class)

    # the following is not necessary anymore, as labels are now strings!
    # # re-map the labels (from 1 to n) to ensure numeric labels do not differ
    # datasets = remap_labels(datasets, common_ds, class_set)

    # determine the common size for training
    train_size_common, total_test_samples = determine_training_size(train_perc,
                                                                    target_sizes,
                                                                    num_classes)

    # the main parallel loop to crunch optimizations, predictions and evaluations
    # chunk_size = int(np.ceil(num_repetitions/num_procs))
    if num_procs > 1:
        print('Parallelizing the repetitions of CV with {} processes ...'
              ''.format(num_procs))
        with Manager() as proxy_manager:
            shared_inputs = proxy_manager.list(
                    [datasets, impute_strategy, train_size_common, feat_sel_size,
                     train_perc, total_test_samples, num_classes, num_features,
                     class_set, method_names, pos_class_index, out_results_dir,
                     grid_search_level, classifier_name, feat_select_method])
            partial_func_holdout = partial(holdout_trial_compare_datasets,
                                           *shared_inputs)

            with Pool(processes=num_procs) as pool:
                cv_results = pool.map(partial_func_holdout, range(num_repetitions))
    else:
        # switching to regular sequential for loop
        partial_func_holdout = partial(holdout_trial_compare_datasets, datasets,
                                       impute_strategy, train_size_common,
                                       feat_sel_size, train_perc, total_test_samples,
                                       num_classes, num_features, class_set,
                                       method_names, pos_class_index,
                                       out_results_dir, grid_search_level,
                                       classifier_name, feat_select_method)
        cv_results = [partial_func_holdout(rep_id=rep) for rep in range(num_repetitions)]

    # re-assemble results into a convenient form
    pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep, \
    confusion_matrix, accuracy_balanced, auc_weighted, best_params, feature_names,\
    feature_importances_per_rep, feature_importances_rf, num_times_misclfd, \
    num_times_tested = gather_results_across_trials(
        cv_results, common_ds, datasets, total_test_samples, num_repetitions,
        num_datasets, num_classes, num_features)

    # saving the required variables to disk in a dict
    locals_var_dict = locals()
    dict_to_save = {var: locals_var_dict[var]
                    for var in cfg.rhst_data_variables_to_persist}
    out_results_path = save_results(out_results_dir, dict_to_save)

    report_best_params(best_params, method_names, out_results_dir)

    # exporting the results right away, without waiting for figures
    export_results(dict_to_save, out_results_dir, options_path)

    summarize_perf(accuracy_balanced, auc_weighted, method_names,
                   num_classes, num_datasets)

    return out_results_path


def determine_training_size(train_perc, target_sizes, num_classes):
    """Computes the maximum training size that the smallest class can provide """

    print("Different classes in the training set are stratified "
          "to match the smallest class!")
    train_size_per_class = np.int64(np.floor(train_perc * target_sizes).astype(np.float64))
    # per-class
    train_size_common = np.int64(np.minimum(min(train_size_per_class), train_size_per_class))
    # single number
    reduced_sizes = np.unique(train_size_common)
    if len(reduced_sizes) != 1:
        raise ValueError("Error in stratification of training set "
                         "based on the smallest class!")
    train_size_common = reduced_sizes[0]

    if train_size_common < 1:
        raise ValueError('Invalid state - Zero samples selected for training!'
                         'Check the class size distribution in dataset!')

    total_test_samples = np.int64(np.sum(target_sizes) - num_classes * train_size_common)

    return train_size_common, total_test_samples


def initialize_misclf_counters(sample_ids, num_datasets):
    """Initialize misclassification counters."""

    num_times_tested = list()
    num_times_misclfd = list()
    for dd in range(num_datasets):
        num_times_tested.append(Counter(sample_ids))
        num_times_misclfd.append(Counter(sample_ids))
        for subid in sample_ids:
            num_times_tested[dd][subid] = 0
            num_times_misclfd[dd][subid] = 0

    return num_times_misclfd, num_times_tested


def initialize_result_containers(common_ds, datasets, total_test_samples,
                                 n_repetitions, n_datasets, n_classes, num_features):
    """Prepare containers for various outputs"""

    pred_prob_per_class = np.full([n_repetitions, n_datasets,
                                   total_test_samples, n_classes], np.nan)
    pred_labels_per_rep_fs = np.full([n_repetitions, n_datasets, total_test_samples],
                                     fill_value=np.nan, dtype=object);
    test_labels_per_rep = np.full([n_repetitions, total_test_samples],
                                  fill_value=np.nan, dtype=object)

    best_params = [None] * n_repetitions

    num_times_misclfd, num_times_tested = \
        initialize_misclf_counters(common_ds.samplet_ids, n_datasets)

    # multi-class metrics
    confusion_matrix = np.full([n_repetitions, n_classes, n_classes, n_datasets],
                               np.nan)
    accuracy_balanced = np.full([n_repetitions, n_datasets], np.nan)
    auc_weighted = np.full([n_repetitions, n_datasets], np.nan)

    feature_names = [None] * n_datasets
    feature_importances_per_rep = [None] * n_repetitions
    feature_importances_rf = [None] * n_datasets
    for idx in range(n_datasets):
        feature_importances_rf[idx] = np.full([n_repetitions, num_features[idx]],
                                              np.nan)
        feature_names[idx] = datasets[idx].feature_names

    return pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep, \
           confusion_matrix, accuracy_balanced, auc_weighted, best_params, \
           feature_names, feature_importances_per_rep, feature_importances_rf, \
           num_times_misclfd, num_times_tested


def get_pretty_print_options(method_names, num_datasets):
    """Returns field widths for formatting"""

    if len(method_names) < num_datasets:
        raise ValueError('Insufficient number of names (n={}) '
                         'for the given feature sets (n={}).'
                         ''.format(len(method_names), num_datasets))

    max_width_method_names = max(map(len, method_names))
    ndigits_ndatasets = len(str(num_datasets))
    pretty_print = namedtuple('pprint', ['str_width', 'num_digits'])
    print_options = pretty_print(max_width_method_names, ndigits_ndatasets)

    return print_options


def remap_labels(datasets, common_ds, class_set):
    """re-map the labels (from 1 to n) to ensure numeric labels do not differ"""

    numeric_labels = make_numeric_labels(class_set)

    labels_with_correspondence = dict()
    for subid in common_ds.samplet_ids:
        labels_with_correspondence[subid] = numeric_labels[common_ds.targets[subid]]

    for idx in range(len(datasets)):
        datasets[idx].labels = labels_with_correspondence

    return datasets


def holdout_trial_compare_datasets(datasets, impute_strategy, train_size_common,
                                   feat_sel_size, train_perc, total_test_samples,
                                   num_classes, num_features_per_dataset, class_set,
                                   method_names, pos_class_index, out_results_dir,
                                   grid_search_level, classifier_name,
                                   feat_select_method, rep_id=None):
    """
    Runs a single iteration of optimizing the chosen pipeline on the chosen
    training set, and evaluations on the given test set.

    Parameters
    ----------
    datasets

    impute_strategy : str
        Strategy to handle the missing data: whether to raise an error if data is
        missing, or to impute them using the method chosen here.

    train_size_common
    feat_sel_size
    train_perc
    total_test_samples
    num_classes
    num_features_per_dataset
    class_set
    method_names
    pos_class_index
    out_results_dir
    rep_id

    grid_search_level : str
        If 'light', grid search resolution will be reduced to speed up optimization.
        If 'exhaustive', broadest range of values for most parameters will be used
        for optimization.

    Returns
    -------

    """

    common_ds = datasets[cfg.COMMON_DATASET_INDEX]
    num_datasets = len(datasets)

    # multi-class metrics
    confusion_matrix = np.full([num_classes, num_classes, num_datasets], np.nan)
    accuracy_balanced = np.full(num_datasets, np.nan)
    auc_weighted = np.full(num_datasets, np.nan)
    best_params = [None] * num_datasets
    misclsfd_ids_this_run = [None] * num_datasets

    feature_importances = [None] * num_datasets
    for idx in range(num_datasets):
        feature_importances[idx] = np.full(num_features_per_dataset[idx], np.nan)

    # set of subjects for training and testing, common for all datasets.
    train_set, test_set = common_ds.train_test_split_ids(count_per_class=train_size_common)
    # NOTE test labels are the same for all datasets - each feature/model
    # combination is being evaluated against the same set of test samplets
    true_test_labels = np.array([common_ds.targets[sid]
                        for sid in test_set if sid in common_ds.targets])

    pred_prob_per_class = np.full([num_datasets, total_test_samples, num_classes],
                                  np.nan)
    pred_labels_per_rep_fs = np.empty([num_datasets, total_test_samples],
                                     dtype=true_test_labels.dtype)

    # to uniquely identify this iteration
    if rep_id is None:
        rep_proc_id = 'process{}'.format(os.getpid())  # str(os.getpid())
    else:
        rep_proc_id = str(rep_id)
    print_options = get_pretty_print_options(method_names, num_datasets)

    # evaluating each feature/dataset
    for dd in range(num_datasets):
        print("CV trial {rep:6} "
              "feature {index:{nd}} "
              "{name:>{namewidth}} : "
              "".format(rep=rep_proc_id, index=dd, name=method_names[dd],
                        nd=print_options.num_digits,
                        namewidth=print_options.str_width),
              end='')

        # using the same train/test sets for all feature sets.
        train_fs = datasets[dd].get_subset(train_set)
        test_fs = datasets[dd].get_subset(test_set)

        pred_prob_per_class[dd, :, :], pred_labels_per_rep_fs[dd,:], \
        _ignored_true_test_labels, conf_mat, misclsfd_ids_this_run[dd], \
        feature_importances[dd], best_params[dd] = \
            eval_optimized_model_on_testset(train_fs, test_fs,
                                            impute_strategy=impute_strategy,
                                            train_perc=train_perc,
                                            feat_sel_size=feat_sel_size,
                                            label_order_in_conf_matrix=class_set,
                                            grid_search_level=grid_search_level,
                                            classifier_name=classifier_name,
                                            feat_select_method=feat_select_method)

        # TODO new feature: add additional metrics such as PPV
        accuracy_balanced[dd] = balanced_accuracy(conf_mat)
        confusion_matrix[:, :, dd] = conf_mat
        print('balanced accuracy: {:.4f} '.format(accuracy_balanced[dd]), end='')

        if num_classes == 2:
            auc_weighted[dd] = roc_auc_score(true_test_labels,
                                             pred_prob_per_class[dd, :, pos_class_index],
                                             average='weighted')
            print('\t weighted AUC: {:.4f}'.format(auc_weighted[dd]), end='')

        print('', flush=True)
        sys.stdout.flush()
        sys.stderr.flush()

    results_list = [pred_prob_per_class, pred_labels_per_rep_fs, true_test_labels,
                    accuracy_balanced,
                    confusion_matrix, auc_weighted, feature_importances, best_params,
                    misclsfd_ids_this_run, test_set]

    tmp_dir = get_temp_dir(out_results_dir)
    out_path = pjoin(tmp_dir, '{}_{}.pkl'.format(cfg.temp_prefix_rhst, rep_proc_id))
    logging.info('results from rep {} saved to {}'.format(rep_proc_id, out_path))
    with open(out_path, 'bw') as of:
        pickle.dump(results_list, of)

    return results_list


def get_temp_dir(out_results_dir):
    "Scratch directory to save temporary results to"

    tmp_dir = pjoin(out_results_dir, cfg.temp_results_dir)
    os.makedirs(tmp_dir, exist_ok=True)

    return tmp_dir


def cleanup(out_dir):
    "Helper to perform cleanup"

    tmp_dir = get_temp_dir(out_dir)
    try:
        shutil.rmtree(tmp_dir)
    except:
        traceback.print_exc()
        print('Unable to delete temporary folder at:\n\t{}\n'
              'Remove it manually if you would like to save space.'.format(tmp_dir))

    return


def gather_results_across_trials(cv_results, common_ds, datasets, total_test_samples,
                                 num_repetitions, num_datasets, num_classes,
                                 num_features):
    "Reorganizes list of indiv CV trial results into rectangular arrays."

    pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep, \
    confusion_matrix, accuracy_balanced, auc_weighted, best_params, feature_names,\
    feature_importances_per_rep, feature_importances_rf, num_times_misclfd, \
    num_times_tested = initialize_result_containers(
        common_ds, datasets, total_test_samples, num_repetitions, num_datasets,
        num_classes, num_features)

    for rep in range(num_repetitions):
        # unpacking each rep
        _rep_pred_prob_per_class, _rep_pred_labels_per_rep_fs, \
        _rep_true_test_labels, _rep_accuracy_balanced, _rep_confusion_matrix, \
        _rep_auc_weighted, _rep_feature_importances, _rep_best_params, \
        _rep_misclsfd_ids_this_run, _rep_test_set = cv_results[rep]

        pred_prob_per_class[rep, :, :, :] = _rep_pred_prob_per_class
        pred_labels_per_rep_fs[rep, :, :] = _rep_pred_labels_per_rep_fs
        test_labels_per_rep[rep, :] = _rep_true_test_labels
        accuracy_balanced[rep, :] = _rep_accuracy_balanced
        confusion_matrix[rep, :, :, :] = _rep_confusion_matrix
        auc_weighted[rep, :] = _rep_auc_weighted
        best_params[rep] = _rep_best_params

        for dd in range(num_datasets):
            num_times_misclfd[dd].update(_rep_misclsfd_ids_this_run[dd])
            num_times_tested[dd].update(_rep_test_set)
            feature_importances_rf[dd][rep, :] = _rep_feature_importances[dd]

        # this variable is not being saved/used in any other way.
        feature_importances_per_rep[rep] = _rep_feature_importances

    return pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep, \
           confusion_matrix, accuracy_balanced, auc_weighted, best_params, \
           feature_names, feature_importances_per_rep, feature_importances_rf, \
           num_times_misclfd, num_times_tested


def summarize_perf(accuracy_balanced, auc_weighted, method_names,
                   num_classes, num_datasets):
    """Prints median performance for each feature set"""

    with warnings.catch_warnings():
        warnings.filterwarnings(action='ignore', message='All-NaN slice encountered',
                                module='numpy', category=RuntimeWarning)

        # assuming the first column (axis 0) is over num_repititions
        median_bal_acc = np.nanmedian(accuracy_balanced, axis=0)
        if num_classes == 2:
            median_wtd_auc = np.nanmedian(auc_weighted, axis=0)

    print_options = get_pretty_print_options(method_names, num_datasets)

    print('\nMedian performance summary:', end='')
    for dd in range(num_datasets):
        print("\nfeature {index:{nd}} {name:>{namewidth}} : "
              "balanced accuracy {accuracy:2.2f} "
              "".format(index=dd, name=method_names[dd], accuracy=median_bal_acc[dd],
                        namewidth=print_options.str_width,
                        nd=print_options.num_digits), end='')
        if num_classes == 2:
            print("\t AUC {auc:2.2f}".format(auc=median_wtd_auc[dd]), end='')

    print('')
    return


if __name__ == '__main__':
    pass
neuropredict 0.6+3.g3e6b884.dirty documentation

Source code for rhst