"""Source code for multi_edge"""

__all__ = ['extract_multiedge', 'summarize_multigraph']

import sys
import traceback
import warnings
from functools import partial
from multiprocessing import Manager, Pool
from os.path import getsize, isfile
from sys import version_info

import hiwenet
import networkx as nx
import numpy as np

if version_info.major > 2:
    from graynet.utils import (stamp_expt_multiedge, check_params_multiedge,
                               make_output_path_graph,
                               save_graph, check_subjects, check_stat_methods,
                               check_num_bins,
                               check_weights,
                               check_num_procs, check_atlas, check_edge_range_dict,
                               mask_background_roi,
                               warn_nan,
                               stamp_expt_weight, import_features,
                               save_per_subject_graph)
    from graynet import parcellate
    from graynet import config_graynet as cfg
else:
    raise NotImplementedError(
            'graynet supports only Python 3 or higher. '
            'Upgrading to Python 3+ is highly recommended.')


def extract_multiedge(subject_id_list,
                      input_dir,
                      base_feature_list=cfg.default_features_multi_edge,
                      weight_method_list=cfg.default_weight_method,
                      summary_stats=cfg.multi_edge_summary_func_default,
                      num_bins=cfg.default_num_bins,
                      edge_range_dict=cfg.edge_range_predefined,
                      atlas=cfg.default_atlas,
                      smoothing_param=cfg.default_smoothing_param,
                      node_size=cfg.default_node_size,
                      out_dir=None,
                      return_results=False,
                      overwrite_results=False,
                      num_procs=cfg.default_num_procs):
    """
    Extracts weighted networks (matrix of pair-wise ROI distances) based on
    multiple gray matter features from Freesurfer processing.

    Parameters
    ----------
    subject_id_list : str or list
        must be path to a file containing subject IDs, or a list of subject IDs

    input_dir : str
        Path to the input directory where features can be read.
        This can be Freesurfer's SUBJECTS_DIR, where the outputs of processing
        are stored, or another directory with a structure that graynet can parse.

    base_feature_list : list
        Set of features that drive the different edges between a pair of ROIs.
        For example, if you choose thickness and pial_curv, each pair of ROIs
        will have two edges. This multi-edge network can be turned into a single
        network by averaging the weights of the individual networks, or by
        computing another summary statistic of your interest.

    weight_method_list : string(s), optional
        Type of distance (or metric) to compute between the pair of histograms.
        It must be one of the following methods:

        - 'chebyshev'
        - 'chebyshev_neg'
        - 'chi_square'
        - 'correlate'
        - 'correlate_1'
        - 'cosine'
        - 'cosine_1'
        - 'cosine_2'
        - 'cosine_alt'
        - 'euclidean'
        - 'fidelity_based'
        - 'histogram_intersection'
        - 'histogram_intersection_1'
        - 'jensen_shannon'
        - 'kullback_leibler'
        - 'manhattan'
        - 'minowski'
        - 'noelle_1'
        - 'noelle_2'
        - 'noelle_3'
        - 'noelle_4'
        - 'noelle_5'
        - 'relative_bin_deviation'
        - 'relative_deviation'

        Note that only the following are *metrics*:

        - 'manhattan'
        - 'minowski'
        - 'euclidean'
        - 'noelle_2'
        - 'noelle_4'
        - 'noelle_5'

        The following are *semi- or quasi-metrics*:

        - 'kullback_leibler'
        - 'jensen_shannon'
        - 'chi_square'
        - 'chebyshev'
        - 'cosine_1'
        - 'chebyshev_neg'
        - 'correlate_1'
        - 'histogram_intersection_1'
        - 'relative_deviation'
        - 'relative_bin_deviation'
        - 'noelle_1'
        - 'noelle_3'

        The following are classified as similarity functions:

        - 'histogram_intersection'
        - 'correlate'
        - 'cosine'
        - 'cosine_2'
        - 'cosine_alt'
        - 'fidelity_based'

        *Default* choice: 'manhattan'.

    summary_stats : list of str
        A string, or list of strings, each representing a method (like 'median',
        'prod' or 'max') to compute a summary statistic from the array of
        multiple weights computed. Each method must be available as a member of
        numpy or scipy.stats.

    num_bins : int
        Number of histogram bins to use when computing pair-wise weights.
        Default : 25

    edge_range_dict : tuple or list
        The range of edges (two finite values) within which to build the
        histogram e.g. ``--edge_range 0 5``.
        This can be helpful (and important) to ensure correspondence across
        multiple invocations of graynet (e.g. for different subjects), in terms
        of the range across all bins as well as individual bin edges.

        Default :

        - ( 0.0, 5.0) for ``freesurfer_thickness`` and
        - (-0.3, 0.3) for ``freesurfer_curv``.

    atlas : str
        Name of the atlas whose parcellation is to be used.
        Choices for parcellation: ['fsaverage', 'glasser2016'], which are
        primarily cortical. Volumetric whole-brain atlases will be added soon.
    smoothing_param : scalar
        Smoothing parameter, which could be fwhm for Freesurfer cortical
        features, or another parameter relevant to the chosen base_feature_list.
        Default: assumed as fwhm=10mm for the default feature choice 'thickness'.

    node_size : scalar, optional
        Parameter to indicate the size of the ROIs, subparcels or patches,
        depending on the type of atlas or feature. This is not implemented yet;
        it is just a placeholder to enable default computation.

    out_dir : Path or str, optional
        Path to the output directory to store results.
        Default: None, in which case results are returned, but not saved to disk.
        If this is None, return_results must be True.

    return_results : bool
        Flag to indicate whether to return the results.
        Keeping this False helps to reduce memory requirements when the number
        of nodes in a parcellation, or the number of subjects or weight methods,
        is large, as results are not retained for all combinations when running
        from the command line interface (or on an HPC).
        Default: False.
        If this is False, out_dir must be specified to save the results to disk.

    overwrite_results : bool
        Flag to request overwriting of existing results, in case of
        reruns/failed jobs. By default, if the expected output file exists and
        is of non-zero size, its computation is skipped (assuming the file is
        complete, usable and not corrupted).

    num_procs : int
        Number of parallel processes to use to speed up computation.

    Returns
    -------
    edge_weights_all : dict, None
        If return_results is True, this will be a dictionary keyed by a tuple
        of (weight_method, base_feature, subject_ID), matching the keys stored
        during computation. Each value is a numpy array of length
        p = k*(k-1)/2, with k = number of nodes in the atlas parcellation.
        If return_results is False, this will be None, which is the default.

    """

    # volumetric version is not fully tested yet!
    for feat in base_feature_list:
        if feat in cfg.features_volumetric:
            raise NotImplementedError('MultiEdge networks are not yet supported '
                                      'for volumetric features! '
                                      'They are under development. Stay tuned.')
    # all the checks must happen here, as this is the key function in the API
    check_params_multiedge(base_feature_list, input_dir, atlas, smoothing_param,
                           node_size, out_dir, return_results)
    atlas, atlas_name = check_atlas(atlas)

    subject_id_list, num_subjects, max_id_width, nd_id = check_subjects(
            subject_id_list)

    num_bins = check_num_bins(num_bins)
    edge_range_dict = check_edge_range_dict(edge_range_dict, base_feature_list)
    weight_method_list, num_weights, max_wtname_width, nd_wm = check_weights(
            weight_method_list)

    # validating the choice and getting a callable
    summary_stats, summary_stat_names, _, _, _ = check_stat_methods(summary_stats)

    num_procs = check_num_procs(num_procs)
    pretty_print_options = (max_id_width, nd_id,
                            num_weights, max_wtname_width, nd_wm)

    # roi_labels, ctx_annot = parcellate.freesurfer_roi_labels(atlas)
    # uniq_rois, roi_size, num_nodes = roi_info(roi_labels)
    uniq_rois, centroids, roi_labels = parcellate.roi_labels_centroids(atlas)

    print('\nProcessing {} features resampled to {} atlas,'
          ' smoothed at {} with node size {}'.format(base_feature_list,
                                                     atlas_name,
                                                     smoothing_param, node_size))

    if not return_results:
        if out_dir is None:
            raise ValueError('When return_results=False, out_dir must be '
                             'specified to be able to save the results.')
        if not out_dir.exists():
            out_dir.mkdir(exist_ok=True, parents=True)

    partial_func_extract = partial(per_subject_multi_edge, input_dir,
                                   base_feature_list, roi_labels, centroids,
                                   weight_method_list, summary_stats,
                                   summary_stat_names, atlas, atlas_name,
                                   smoothing_param, node_size, num_bins,
                                   edge_range_dict, out_dir, return_results,
                                   overwrite_results, pretty_print_options)
    if num_procs > 1:
        chunk_size = int(np.ceil(num_subjects / num_procs))
        with Manager():
            with Pool(processes=num_procs) as pool:
                edge_weights_list_dicts = pool.map(partial_func_extract,
                                                   subject_id_list, chunk_size)
    else:
        # reverting to sequential processing
        edge_weights_list_dicts = [partial_func_extract(subject=sub_id)
                                   for sub_id in subject_id_list]

    if return_results:
        edge_weights_all = dict()
        for combo in edge_weights_list_dicts:
            # each element from the output of the parallel loop is a dict
            # keyed by (weight_method, base_feature, subject)
            edge_weights_all.update(combo)
    else:
        edge_weights_all = None

    print('\ngraynet computation done.')
    return edge_weights_all
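

# A minimal usage sketch (not part of the module), showing how extract_multiedge
# might be invoked. The subject IDs and paths below are hypothetical; only the
# keyword names mirror the signature above, and defaults cover the rest.
#
#   from pathlib import Path
#   from graynet.multi_edge import extract_multiedge
#
#   edge_weights = extract_multiedge(
#           subject_id_list=['sub001', 'sub002'],            # hypothetical IDs
#           input_dir='/path/to/freesurfer/SUBJECTS_DIR',    # hypothetical path
#           base_feature_list=['freesurfer_thickness', 'freesurfer_curv'],
#           weight_method_list=['manhattan'],
#           summary_stats=['median'],
#           edge_range_dict={'freesurfer_thickness': (0.0, 5.0),
#                            'freesurfer_curv'     : (-0.3, 0.3)},
#           out_dir=Path('/tmp/graynet_out'),                # hypothetical path
#           return_results=True)
#   # keys of the result are (weight_method, base_feature, subject_ID) tuples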
def per_subject_multi_edge(input_dir, base_feature_list, roi_labels, centroids,
                           weight_method_list, summary_stats, summary_stat_names,
                           atlas, atlas_name, smoothing_param, node_size,
                           num_bins, edge_range_dict,
                           out_dir, return_results, overwrite_results,
                           pretty_print_options,
                           subject=None):
    # subject is purposefully left last to enable partial function creation
    """Extracts the given set of weights for one subject."""

    if subject is None:
        return

    if return_results:
        edge_weights_all = dict()
    else:
        edge_weights_all = None

    max_id_width, nd_id, num_weights, max_wtname_width, nd_wm = pretty_print_options

    for ww, weight_method in enumerate(weight_method_list):

        expt_id_multi = stamp_expt_multiedge(base_feature_list, atlas_name,
                                             smoothing_param, node_size,
                                             weight_method)
        out_path_multigraph = make_output_path_graph(out_dir, subject,
                                                     [expt_id_multi,
                                                      'multigraph'])
        # skipping the computation if the file exists already
        if not overwrite_results and isfile(out_path_multigraph) \
                and getsize(out_path_multigraph) > 0:
            print('\nMultigraph exists already at\n\t{}\n'
                  ' skipping its computation!'.format(out_path_multigraph))
            multigraph = None  # signal to re-read
        else:
            multigraph = nx.MultiGraph()

            for base_feature in base_feature_list:
                try:
                    features = import_features(input_dir,
                                               [subject, ],
                                               base_feature,
                                               fwhm=smoothing_param,
                                               atlas=atlas)
                except:
                    traceback.print_exc()
                    warnings.warn('Unable to read {} features'
                                  ' for {}\n Skipping it.'
                                  ''.format(base_feature, subject), UserWarning)
                    return

                data, rois = mask_background_roi(features[subject], roi_labels,
                                                 cfg.null_roi_name)

                # unique stamp for each subject and weight
                expt_id_single = stamp_expt_weight(base_feature, atlas_name,
                                                   smoothing_param, node_size,
                                                   weight_method)
                sys.stdout.write('\nProcessing id {:{id_width}} --'
                                 ' weight {:{wtname_width}} ({:{nd_wm}}/{:{nd_wm}})'
                                 ' :'.format(subject, weight_method,
                                             ww + 1, num_weights,
                                             nd_id=nd_id, nd_wm=nd_wm,
                                             id_width=max_id_width,
                                             wtname_width=max_wtname_width))

                # actual computation of pair-wise features
                try:
                    unigraph = hiwenet.extract(data,
                                               rois,
                                               weight_method=weight_method,
                                               num_bins=num_bins,
                                               edge_range=edge_range_dict[base_feature],
                                               return_networkx_graph=True)

                    # retrieving edge weights
                    weight_vec = np.array(list(
                            nx.get_edge_attributes(unigraph, 'weight').values()))
                    warn_nan(weight_vec)
                    if return_results:
                        edge_weights_all[(weight_method, base_feature, subject)] \
                            = weight_vec

                except (RuntimeError, RuntimeWarning) as runexc:
                    print(runexc)
                except KeyboardInterrupt:
                    print('Exiting on keyboard interrupt! \n'
                          'Abandoning the remaining processing ')
                    sys.exit(1)
                except:
                    print('Unable to extract {} weights for {} for {}'
                          ''.format(weight_method, base_feature, subject))
                    traceback.print_exc()

                print('Done.')

                # TODO consider extracting some network features upon user request
                add_nodal_positions(unigraph, centroids)
                save_per_subject_graph(unigraph, out_dir, subject, expt_id_single)

                # adding edges/weights from each feature to a multigraph;
                # this also encodes the sources
                for u, v in unigraph.edges():
                    multigraph.add_edge(u, v,
                                        weight=unigraph[u][v]['weight'],
                                        base_feature=base_feature)

            # adding position info to nodes (for visualization later)
            add_nodal_positions(multigraph, centroids)
            save_graph(multigraph, out_path_multigraph, 'multi-edge')

        for stat_func, stat_name in zip(summary_stats, summary_stat_names):
            # creating a single graph with a summary edge weight (like median)
            out_path_summary = make_output_path_graph(out_dir, subject,
                                                      [expt_id_multi, stat_name,
                                                       'multigraph'])
            if (not overwrite_results) and \
                    isfile(out_path_summary) and \
                    (getsize(out_path_summary) > 0):
                print('Summary {} of multigraph exists already at\n\t{}\n'
                      ' skipping its computation!'
                      ''.format(stat_name, out_path_summary))
            else:
                if multigraph is None:
                    multigraph = nx.read_graphml(out_path_multigraph)

                try:
                    summary_multigraph = summarize_multigraph(multigraph,
                                                              stat_func)
                    add_nodal_positions(summary_multigraph, centroids)
                    save_graph(summary_multigraph, out_path_summary,
                               '{} summary'.format(stat_name))
                except:
                    print('Summary {} could not be computed - skipping!'
                          ''.format(stat_name))
                    traceback.print_exc()

    return edge_weights_all


def summarize_multigraph(multigraph, func_summary):
    """Creates a single graph with a summary edge weight (like the median)."""

    summary_multigraph = nx.Graph()
    for u, v in multigraph.edges():
        # looping through parallel edges and obtaining their weights
        all_weights = np.array([edge_item['weight']
                                for idx, edge_item in multigraph[u][v].items()])
        # float() needed due to a graphml limitation
        summary_weight = float(func_summary(all_weights))
        summary_multigraph.add_edge(u, v, weight=summary_weight)

    return summary_multigraph


def add_nodal_positions(graph, centroids):
    """Adds the x, y, z attributes to each node in the graph."""

    # adding position info to nodes (for visualization later)
    for roi in centroids:
        graph.nodes[roi]['x'] = float(centroids[roi][0])
        graph.nodes[roi]['y'] = float(centroids[roi][1])
        graph.nodes[roi]['z'] = float(centroids[roi][2])

    return


def save_summary_graph(graph, out_dir, subject,
                       str_suffix=None,
                       summary_descr='summary'):
    """Saves the summary multi-graph to disk."""

    if out_dir is not None:
        # get outpath returned from hiwenet based on dist name and all other params
        # choose out_dir name based on dist name and all other parameters
        out_subject_dir = out_dir.joinpath(subject)
        if not out_subject_dir.exists():
            out_subject_dir.mkdir(exist_ok=True, parents=True)

        if str_suffix is not None:
            out_file_name = '{}_{}_multigraph_graynet.graphml' \
                            ''.format(str_suffix, summary_descr)
        else:
            out_file_name = '_{}_multigraph_graynet.graphml'.format(summary_descr)

        out_weights_path = out_subject_dir / out_file_name

        try:
            print(graph)
            nx.write_graphml(graph, out_weights_path, encoding='utf-8')
            print('\nSaved the summary multi-graph to \n{}'
                  ''.format(out_weights_path))
        except:
            print('\nUnable to save summary multi-graph to \n{}'
                  ''.format(out_weights_path))
            traceback.print_exc()

    return
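

# A small self-contained sketch (not part of the module) of what
# summarize_multigraph does; the ROI names and weights below are made up
# purely for illustration:
#
#   mg = nx.MultiGraph()
#   mg.add_edge('roi_1', 'roi_2', weight=0.8, base_feature='freesurfer_thickness')
#   mg.add_edge('roi_1', 'roi_2', weight=0.2, base_feature='freesurfer_curv')
#   summary = summarize_multigraph(mg, np.median)
#   print(summary['roi_1']['roi_2']['weight'])   # 0.5, the median of 0.8 and 0.2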