Source code for abagen.allen

# -*- coding: utf-8 -*-
"""
Functions for mapping AHBA microarray dataset to atlases and and parcellations
"""

from functools import reduce
import logging
import warnings

import nibabel as nib
import numpy as np
import pandas as pd

from . import (correct, datasets, images, io, matching, probes_, reporting,
               samples_, utils)
from .utils import first_entry, flatten_dict

logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)
LGR = logging.getLogger('abagen')


[docs]def get_expression_data(atlas,
                        atlas_info=None,
                        *,
                        ibf_threshold=0.5,
                        probe_selection='diff_stability',
                        donor_probes='aggregate',
                        sim_threshold=None,
                        lr_mirror=None,
                        exact=None, missing=None,
                        tolerance=2,
                        sample_norm='srs',
                        gene_norm='srs',
                        norm_matched=True,
                        norm_structures=False,
                        region_agg='donors',
                        agg_metric='mean',
                        corrected_mni=True,
                        reannotated=True,
                        return_counts=False,
                        return_donors=False,
                        return_report=False,
                        donors='all',
                        data_dir=None,
                        verbose=0,
                        n_proc=1):
    """
    Assigns microarray expression data to ROIs defined in `atlas`

    This function aims to provide a workflow for generating pre-processed,
    microarray expression data from the Allen Human Brain Atlas ([A2]_) for
    abitrary `atlas` designations. First, some basic filtering of genetic
    probes is performed, including:

        1. Intensity-based filtering of microarray probes to remove probes that
           do not exceed a certain level of background noise (specified via the
           `ibf_threshold` parameter),
        2. Selection of a single, representative probe (or collapsing across
           probes) for each gene, specified via the `probe_selection`
           parameter (and influenced by the `donor_probes` parameter), and
        3. Optional mirroring of the tissue samples across the left/right
           hemisphere boundary, as specified via the `lr_mirror` parameter
           (turned off by default).

    Tissue samples are then matched to parcels in the defined `atlas` for each
    donor. If `atlas_info` is provided then this matching is constrained by
    both hemisphere and tissue class designation (e.g., cortical samples from
    the left hemisphere are only matched to ROIs in the left cortex,
    subcortical samples from the right hemisphere are only matched to ROIs in
    the left subcortex); see the `atlas_info` parameter description for more
    information.

    Matching of microarray samples to parcels in `atlas` is done via a multi-
    step process:

        1. Determine if the sample falls directly within a parcel,
        2. Check to see if there are nearby parcels by slowly expanding the
           search space to include nearby voxels, up to a specified distance
           (specified via the `tolerance` parameter),
        3. If there are multiple nearby parcels, the sample is assigned to the
           closest parcel, as determined by the parcel centroid.

    If at any step a sample can be assigned to a parcel the matching process is
    terminated. When the provided atlas is not volumetric (i.e., surface-based)
    the samples are simply matched to the nearest vertex, and `tolerance` is
    used as a standard deviation threshold. More control over the sample
    matching can be obtained by setting the `exact` parameter; see the
    parameter description for more information.

    Once all samples have been matched to parcels for all supplied donors, the
    microarray expression data are optionally normalized via the provided
    `sample_norm` and `gene_norm` functions (which are influenced by the
    `norm_matched` and `norm_structures` parameters) before being aggregated
    across donors via the supplied `region_agg` and `agg_metric` parameters.

    Parameters
    ----------
    atlas : niimg-like object or dict
        A parcellation image in MNI space or a tuple of GIFTI images in
        fsaverage5 space, where each parcel is identified by a unique integer
        ID. Alternatively, a dictionary where keys are donor IDs and values are
        parcellation images (or surfaces) in the native space of each donor.
    atlas_info : os.PathLike or pandas.DataFrame, optional
        Filepath to or pre-loaded dataframe containing information about
        `atlas`. Must have at least columns 'id', 'hemisphere', and 'structure'
        containing information mapping atlas IDs to hemisphere (i.e, "L", "R",
        "B") and broad structural class (i.e., "cortex", "subcortex/brainstem",
        "cerebellum"). If provided, this will constrain matching of tissue
        samples to regions in `atlas`. If `atlas` is a tuple of GIFTI images
        with valid label tables this will be intuited from the data. Default:
        None
    ibf_threshold : [0, 1] float, optional
        Threshold for intensity-based filtering. This number specifies the
        ratio of samples, across all supplied donors, for which a probe must
        have signal significantly greater than background noise in order to be
        retained. Default: 0.5
    probe_selection : str, optional
        Selection method for subsetting (or collapsing across) probes that
        index the same gene. Must be one of 'average', 'max_intensity',
        'max_variance', 'pc_loading', 'corr_variance', 'corr_intensity', or
        'diff_stability', 'rnaseq'; see Notes for more information on different
        options. Default: 'diff_stability'
    donor_probes : {'aggregate', 'independent', 'common'}, optional
        Whether specified `probe_selection` method should be performed with
        microarray data from all donors ('aggregate'), independently for each
        donor ('independent'), or based on the most common selected probe
        across donors ('common'). Not all combinations of `probe_selection`
        and `donor_probes` methods are viable. Default: 'aggregate'
    sim_threshold : (0, inf) float, optional
        Threshold for inter-areal similarity filtering. Samples are correlated
        across probes and those samples with a total correlation less than
        `sim_threshold` standard deviations below the mean across samples are
        excluded from futher analysis. If not specified no filtering is
        performed. Default: None
    lr_mirror : {None, 'bidirectional', 'leftright', 'rightleft'}, optional
        Whether to mirror microarray expression samples across hemispheres to
        increase spatial coverage. Using 'bidirectional' will mirror samples
        across both hemispheres, 'leftright' will mirror samples in the left
        hemisphere to the right, and 'rightleft' will mirror the right to the
        left. Default: None
    missing : {'centroids', 'interpolate', None}, optional
        How to handle regions in `atlas` that are not assigned any tissue
        samples. If 'centroids', any empty regions will be assigned the
        expression value of the nearest tissue sample (defined as the sample
        with the closest Euclidean distance to the parcel centroid). If
        'interpolate', expression values will be interpolated in the empty
        regions by assigning every node in the region the expression of the
        nearest sample and taking a weighted (inverse distance) average. If
        not specified empty regions will be returned with expression values of
        NaN. Default: None
    tolerance : int, optional
        Distance (in mm) that a sample must be from a parcel for it to be
        matched to that parcel. If `atlas` is a tuple of surface files then
        this measure is a standard deviation threshold (i.e., samples greater
        than `tolerance` SDs away from the mean matched distance are ignored).
        Default: 2
    sample_norm : {'rs', 'srs', 'minmax', 'center', 'zscore', None}, optional
        Method by which to normalize microarray expression values for each
        sample. Expression values are normalized separately for each sample and
        donor across all genes; see Notes for more information on different
        methods. If None is specified then no normalization is performed.
        Default: 'srs'
    gene_norm : {'rs', 'srs', 'minmax', 'center', 'zscore', None}, optional
        Method by which to normalize microarray expression values for each
        donor. Expression values are normalized separately for each gene and
        donor across all samples; see Notes for more information on different
        methods. If None is specified then no normalization is performed.
        Default: 'srs'
    norm_matched : bool, optional
        Whether to perform gene normalization (`gene_norm`) across only those
        samples matched to regions in `atlas` instead of all available samples.
        If `atlas` is very small (i.e., only a few regions of interest), using
        `norm_matched=False` is suggested. Default: True
    norm_structures : bool, optional
        Whether to perform gene normalization (`gene_norm`) within structural
        classes (i.e., 'cortex', 'subcortex/brainstem', 'cerebellum') instead
        of across all available samples. Default: False
    region_agg : {'samples', 'donors'}, optional
        When multiple samples are identified as belonging to a region in
        `atlas` this determines how they are aggegated. If 'samples',
        expression data from all samples for all donors assigned to a given
        region are combined. If 'donors', expression values for all samples
        assigned to a given region are combined independently for each donor
        before being combined across donors. See `agg_metric` for mechanism by
        which samples are combined. Default: 'donors'
    agg_metric : {'mean', 'median'} or callable, optional
        Mechanism by which to reduce sample-level expression data into region-
        level expression (see `region_agg`). If a callable, should be able to
        accept an `N`-dimensional input and the `axis` keyword argument and
        return an `N-1`-dimensional output. Default: 'mean'
    corrected_mni : bool, optional
        Whether to use the "corrected" MNI coordinates shipped with the
        `alleninf` package instead of the coordinates provided with the AHBA
        data when matching tissue samples to anatomical regions. Default: True
    reannotated : bool, optional
        Whether to use reannotated probe information provided by [A1]_ instead
        of the default probe information from the AHBA dataset. Using
        reannotated information will discard probes that could not be reliably
        matched to genes. Default: True
    return_counts : bool, optional
        Whether to return dataframe containing information on how many samples
        were assigned to each parcel in `atlas` for each donor. Default: False
    return_donors : bool, optional
        Whether to return donor-level expression arrays instead of aggregating
        expression across donors with provided `agg_metric`. Default: False
    return_report : bool, optional
        Whether to return a string containing longform text describing the
        processing procedures used to generate the `expression` DataFrames
        returned by this function. Default: False
    donors : list, optional
        List of donors to use as sources of expression data. Can be either
        donor numbers or UID. If not specified will use all available donors.
        Note that donors '9861' and '10021' have samples from both left + right
        hemispheres; all other donors have samples from the left hemisphere
        only. Default: 'all'
    data_dir : os.PathLike, optional
        Directory where expression data should be downloaded (if it does not
        already exist) / loaded. If not specified will use the current
        directory. Default: None
    verbose : int, optional
        Specifies verbosity of status messages to display during workflow.
        Higher numbers increase verbosity of messages while zero suppresses all
        messages. Default: 1
    n_proc : int, optional
        Number of processors to use to download AHBA data. Can parallelize up
        to six times. Default: 1

    Returns
    -------
    expression : (R, G) pandas.DataFrame
        Microarray expression for `R` regions in `atlas` for `G` genes,
        aggregated across donors, where the index corresponds to the unique
        integer IDs of `atlas` and the columns are gene names. If
        ``return_donors=True`` then this is a list of (R, G) dataframes, one
        for each donor.
    counts : (R, D) pandas.DataFrame
        Number of samples assigned to each of `R` regions in `atlas` for each
        of `D` donors (if multiple donors were specified); only returned if
        ``return_counts=True``.
    report : str
        Methods describing processing procedures implemented to generate
        `expression`, suitable to be used in a manuscript Methods section. Only
        returned if ``return_report=True``.

    Notes
    -----
    The following methods can be used for collapsing across probes when
    multiple probes are available for the same gene:

        1. ``probe_selection='average'``

        Takes the average of expression data across all probes indexing the
        same gene. Providing 'mean' as the input method will return the same
        thing. This method can only be used when `donor_probes='aggregate'`.

        2. ``probe_selection='max_intensity'``

        Selects the probe with the maximum average expression across samples
        from all donors.

        3. ``probe_selection='max_variance'``

        Selects the probe with the maximum variance in expression across
        samples from all donors.

        4. ``probe_selection='pc_loading'``

        Selects the probe with the maximum loading along the first principal
        component of a decomposition performed across samples from all donors.

        5. ``probe_selection='corr_intensity'``

        Selects the probe with the maximum correlation to other probes from the
        same gene when >2 probes exist; otherwise, uses the same procedure as
        `max_intensity`.

        6. ``probe_selection='corr_variance'``

        Selects the probe with the maximum correlation to other probes from the
        same gene when >2 probes exist; otherwise, uses the same procedure as
        `max_varance`.

        7. ``probe_selection='diff_stability'``

        Selects the probe with the most consistent pattern of regional
        variation across donors (i.e., the highest average correlation across
        brain regions between all pairs of donors). This method can only be
        used when `donor_probes='aggregate'`.

        8. ``method='rnaseq'``

        Selects probes with most consistent pattern of regional variation to
        RNAseq data (across the two donors with RNAseq data). This method can
        only be used when `donor_probes='aggregate'`.

    Note that for incompatible combinations of `probe_selection` and
    `donor_probes` (as detailed above), the `probe_selection choice will take
    precedence. For example, providing ``probe_selection='diff_stability'`` and
    ``donor_probes='independent'`` will cause `donor_probes` to be reset to
    `'aggregate'`.

    The following methods can be used for normalizing microarray expression
    values prior to aggregating:

        1. ``{sample,gene}_norm=='rs'``

        Uses a robust sigmoid function as in [A3]_ to normalize values

        2. ``{sample,gene}_norm='srs'``

        Same as 'rs' but scales output to the unit normal (i.e., range 0-1)

        3. ``{sample,gene}_norm='minmax'``

        Scales data to the unit normal (i.e., range 0-1)

        4. ``{sample,gene}_norm='center'``

        Removes the mean of expression values

        5. ``{sample,gene}_norm='zscore'``

        Applies a basic z-score (subtract mean, divide by standard deviation);
        uses degrees of freedom equal to one for standard deviation

    References
    ----------
    .. [A1] Arnatkevic̆iūtė, A., Fulcher, B. D., & Fornito, A. (2019). A
       practical guide to linking brain-wide gene expression and neuroimaging
       data. NeuroImage, 189, 353-367.
    .. [A2] Hawrylycz, M.J. et al. (2012) An anatomically comprehensive atlas
       of the adult human transcriptome. Nature, 489, 391-399.
    .. [A3] Fulcher, B. D., & Fornito, A. (2016). A transcriptional signature
       of hub connectivity in the mouse connectome. Proceedings of the National
       Academy of Sciences, 113(5), 1435-1440.
    """

    # set logging verbosity level
    LGR.setLevel(dict(zip(range(3), [40, 20, 10])).get(int(verbose), 10))

    # get combination functions
    agg_metric = utils.check_metric(agg_metric)

    # check probe_selection input
    if probe_selection not in probes_.SELECTION_METHODS:
        raise ValueError('Provided probe_selection method is invalid, must be '
                         f'one of {list(probes_.SELECTION_METHODS)}. Received '
                         f'value: {probe_selection}')
    dprobe_opts = ('aggregate', 'independent', 'common')
    if donor_probes not in dprobe_opts:
        raise ValueError('Provided donor_probes method is invalid, must be one'
                         f' of {dprobe_opts}. Received value: {donor_probes}')

    # check lr_mirror input
    if isinstance(lr_mirror, bool):
        warnings.warn('Setting lr_mirror to a boolean value will be '
                      'deprecated in an upcoming release. Use either '
                      '`lr_mirror=None` or `lr_mirror="bidirectional"` '
                      'instead. See documentation for more details.',
                      FutureWarning)
        lr_mirror = 'bidirectional' if lr_mirror else None
    mirror_opts = (None, 'bidirectional', 'leftright', 'rightleft')
    if lr_mirror not in mirror_opts:
        raise ValueError('Provided lr_mirror method is invalid, must be one '
                         f'of {mirror_opts}. Received value: {lr_mirror}')

    # check exact/missing input
    if exact is not None:
        warnings.warn('The exact parameter will be deprecated in an upcoming '
                      'release. Use the `missing` parameter instead. See '
                      'documentation for more details.',
                      FutureWarning)
        missing = None if exact else 'centroids'
    missing_opts = (None, 'interpolate', 'centroids')
    if missing not in missing_opts:
        raise ValueError('Provided missing method is invalid, must be one '
                         f'of {missing_opts}. Received value: {missing}')

    # check that region_agg is viable with return_donors
    if return_donors and region_agg == 'samples':
        raise ValueError('Cannot return donor-level expresison data when '
                         'region_agg parameter is set to "samples".')

    # load atlas and atlas_info, if provided, and coerce to dict
    atlas, group_atlas = images.coerce_atlas_to_dict(atlas, donors, atlas_info)

    # fetch files (downloading if necessary) and unpack to variables
    files = datasets.fetch_microarray(data_dir=data_dir, donors=donors,
                                      verbose=verbose, n_proc=n_proc)

    if probe_selection == 'diff_stability' and len(files) == 1:
        raise ValueError('Cannot use diff_stability for probe_selection with '
                         'only one donor. Please specify a different probe_'
                         'selection method or use more donors.')
    elif probe_selection == 'rnaseq':  # fetch RNAseq if we're gonna need it
        datasets.fetch_rnaseq(data_dir=data_dir, donors=donors,
                              verbose=verbose)

    # get some info on labels in `atlas`
    all_labels = utils.first_entry(atlas).labels
    n_gb = (8 * len(all_labels) * 30000) / (1024 ** 3)
    if n_gb > 1:
        LGR.warning(f'Output matrix may require up to {n_gb:.2f} GB RAM')

    # get dataframe of probe information (reannotated or otherwise)
    # the Probes.csv files are the same for every donor so just grab the first
    probe_info = io.read_probes(first_entry(files, 'probes'))
    if reannotated:
        probe_info = probes_.reannotate_probes(probe_info)

    # drop probes with no/invalid Entrez ID
    probe_info = probe_info.dropna(subset=['entrez_id'])

    # update the annotation "files". this handles updating the MNI coordinates,
    # dropping mistmatched samples (where MNI coordinates don't match the
    # provided ontology), and mirroring samples across hemispheres, if desired
    for donor, data in files.items():
        annot, ontol = data['annotation'], data['ontology']
        t1w = None
        if not group_atlas:
            t1w = datasets.fetch_raw_mri(donors=donor,
                                         data_dir=data_dir,
                                         verbose=verbose)[donor]['t1w']
        annot = samples_.update_coords(annot, corrected_mni=corrected_mni,
                                       native_space=t1w)
        if lr_mirror is not None:
            annot = samples_.mirror_samples(annot, ontol, swap=lr_mirror)
        annot = samples_.drop_mismatch_samples(annot, ontol)
        if sim_threshold is not None:
            annot = samples_.similarity_threshold(data['microarray'],
                                                  annot, probe_info,
                                                  threshold=sim_threshold)
        data['annotation'] = annot
    annotation = flatten_dict(files, 'annotation')

    # intensity-based filtering of probes
    probe_info = probes_.filter_probes(flatten_dict(files, 'pacall'),
                                       annotation, probe_info,
                                       threshold=ibf_threshold)

    # get probe-reduced microarray expression data for all donors based on
    # selection method; this will be a list of gene x sample dataframes (one
    # for each donor)
    microarray = probes_.collapse_probes(flatten_dict(files, 'microarray'),
                                         annotation, probe_info,
                                         method=probe_selection,
                                         donor_probes=donor_probes)
    centroids = []
    counts = pd.DataFrame(np.zeros((len(all_labels) + 1, len(microarray)),
                                   dtype=int),
                          index=np.append([0], all_labels),
                          columns=microarray.keys())
    for subj in microarray:
        if lr_mirror is not None:  # reset index
            idx = pd.Series(range(1, len(microarray[subj]) + 1),
                            name='sample_id')
            microarray[subj].index = idx.copy()
            annotation[subj].index = idx.copy()

        # assign samples to regions
        labels = atlas[subj].label_samples(annotation[subj], tolerance)

        # if we're doing exact matching and want to aggregate samples w/i
        # regions, remove the non-labelled samples prior to normalization.
        # otherwise, we'll remove the non-labelled samples after normalization
        nz = np.asarray(labels != 0).squeeze()
        if nz.sum() == 0:
            LGR.warning(f'No samples matched to atlas for donor {subj}')
            microarray[subj].index = labels['label']
            if missing == 'centroids':
                missing += [(pd.DataFrame(), {})]
            continue

        if norm_matched:
            microarray[subj] = microarray[subj].loc[nz]
            annotation[subj] = annotation[subj].loc[nz]
            labels = labels.loc[nz]

        # if normalizing by structural class get annotation dataframe
        annot = annotation[subj][['structure']] if norm_structures else None
        if sample_norm is not None:
            microarray[subj] = correct.normalize_expression(microarray[subj].T,
                                                            norm=sample_norm,
                                                            ignore_warn=True).T
        if gene_norm is not None:
            microarray[subj] = correct.normalize_expression(microarray[subj],
                                                            norm=gene_norm,
                                                            structures=annot,
                                                            ignore_warn=True)

        # get counts of samples collapsed into each ROI
        labs, num = np.unique(labels, return_counts=True)
        counts.loc[labs, subj] = num
        LGR.info(f'{counts.iloc[1:][subj].sum():>3} / {len(nz)} '
                 f'samples matched to regions for donor #{subj}')

        # if we don't want to do exact matching then cache which parcels are
        # missing data and the expression data for the closest sample to that
        # parcel; we'll use this once we've iterated through all donors
        if missing is not None:
            if missing == 'interpolate':
                exp, lab = _interpolate_missing(atlas[subj], labels,
                                                microarray[subj],
                                                annotation[subj])
                microarray[subj] = pd.concat([microarray[subj], exp])
                labels = pd.concat([labels, lab])
            elif missing == 'centroids':
                centroids += [_match_centroids(atlas[subj], labels,
                                               microarray[subj],
                                               annotation[subj])]

        microarray[subj] = pd.merge(microarray[subj], labels, on='sample_id') \
                             .set_index('label') \
                             .rename_axis('gene_symbol', axis=1)

    # if we don't want to aggregate over regions return sample-level results
    if region_agg is None:
        microarray = {
            donor: micro.set_index(annotation[donor]['well_id'], append=True)
                        .dropna(axis=1, how='any')
            for donor, micro in microarray.items()
        }
        if not return_donors:
            microarray = pd.concat(microarray.values())
        return microarray

    if missing == 'centroids':
        # labels that are missing across all donors
        empty = reduce(set.intersection, [set(f.index) for f, d in centroids])
        LGR.info(f'Matching {len(empty)} region(s) with no data to the '
                 'nearest tissue sample(s)')
        for roi in empty:
            # take weighted average of samples closest to parcel centroid
            # across subjects
            exp, dist = zip(*[(micro.loc[[roi]], dist.get(roi))
                              for micro, dist in centroids])
            weights = _get_weights(np.asarray(dist)[:, None])
            exp = pd.DataFrame(
                (pd.concat(exp) * weights).sum(axis=0) / weights.sum(),
                columns=pd.Series([roi], name='label')
            ).T
            # assign same value to every subject
            for subj in microarray:
                microarray[subj] = pd.concat([microarray[subj], exp])

    microarray = samples_.aggregate_samples(microarray,
                                            labels=all_labels,
                                            region_agg=region_agg,
                                            agg_metric=agg_metric,
                                            return_donors=return_donors)

    counts = counts.drop([0], axis=0)
    if return_report:  # generate report
        report = reporting.Report(atlas, atlas_info=atlas[subj].atlas_info,
                                  ibf_threshold=ibf_threshold,
                                  probe_selection=probe_selection,
                                  donor_probes=donor_probes,
                                  lr_mirror=lr_mirror, missing=missing,
                                  tolerance=tolerance, sample_norm=sample_norm,
                                  gene_norm=gene_norm,
                                  norm_matched=norm_matched,
                                  norm_structures=norm_structures,
                                  region_agg=region_agg, agg_metric=agg_metric,
                                  corrected_mni=corrected_mni,
                                  reannotated=reannotated, donors=donors,
                                  return_donors=return_donors,
                                  data_dir=data_dir, counts=counts,
                                  n_probes=len(probe_info),
                                  n_genes=(first_entry(microarray).shape[1]
                                           if return_donors
                                           else microarray.shape[1])).body

    # pack outputs
    out = (microarray,)
    if return_counts:
        out += (counts,)
    if return_report:
        out += (report,)
    if len(out) == 1:
        out = out[0]

    return out


[docs]def get_samples_in_mask(mask=None, **kwargs):
    """
    Returns preprocessed microarray expression data for samples in `mask`

    Uses the same processing workflow as :func:`abagen.get_expression_data` but
    instead of aggregating samples within regions simply returns sample-level
    expression data for all samples that fall within boundaries of `mask`.

    Parameters
    ----------
    mask : niimg-like object or dict, optional
        A mask image in MNI space or a tuple of GIFTI images in fsaverage5
        space (where 0 is the background). Alternatively, a dictionary where
        keys are donor IDs and values are mask images (or surfaces) in the
        native space of each donor. If not supplied, all available samples will
        be returned. Default: None
    kwargs : key-value pairs
        All key-value pairs from :func:`abagen.get_expression_data` except for:
        `atlas`, `atlas_info`, `region_agg`, and `agg_metric`, which will be
        ignored. If `atlas` is supplied instead of `mask` then `atlas` will be
        used instead as a modified binary image. If both `atlas` and `mask` are
        supplied then `mask` will be used

    Returns
    -------
    expression : (S, G) pandas.DataFrame
        Microarray expression for `S` samples for `G` genes, aggregated across
        donors, where the columns are gene names
    coords : (S,) numpy.ndarray
        MNI coordinates of samples in `expression`. Even if donor-specific
        masks are provided MNI coordinates will be returned to ensure
        comparability between subjects
    """

    # fetch files (downloading if necessary) to get coordinates
    files = datasets.fetch_microarray(data_dir=kwargs.get('data_dir', None),
                                      donors=kwargs.get('donors', 'all'),
                                      verbose=kwargs.get('verbose', 1),
                                      n_proc=kwargs.get('n_proc', 1))

    # get updated coordinates
    for donor, data in files.items():
        annot = data['annotation']
        if kwargs.get('corrected_mni', True):
            annot = samples_.update_mni_coords(annot)
        files[donor]['annotation'] = annot
    cols = ['well_id', 'mni_x', 'mni_y', 'mni_z']
    coords = np.asarray(pd.concat(flatten_dict(files, 'annotation'))[cols])
    well_id, coords = np.asarray(coords[:, 0], 'int'), coords[:, 1:]
    coords = pd.DataFrame(coords, columns=['x', 'y', 'z'], index=well_id)

    # in case people mix things up and use atlas instead of mask, use that
    if kwargs.get('atlas') is not None and mask is None:
        mask = kwargs['atlas']
    elif mask is None:
        # generate atlas image where each voxel with sample has value of 1
        affine = np.eye(4)
        affine[:-1, -1] = -1 * np.floor(np.max(np.abs(coords), axis=0))
        img = np.ones(np.asarray(-2 * affine[:-1, -1], dtype=int))
        mask = nib.Nifti1Image(img, affine=affine)

    # reset these parameters
    kwargs['atlas'] = mask
    kwargs['atlas_info'] = None
    kwargs['region_agg'] = None
    # soft reset this parameter
    kwargs.setdefault('norm_matched', False)

    # get expression data + drop sample coordinates that weren't in atlas
    exp = get_expression_data(**kwargs)
    if kwargs.get('return_donors'):
        exp = {
            donor: micro.drop(index=[0], level='label', errors='ignore')
                        .droplevel('label')
            for donor, micro in exp.items()
        }
        coords = {
            donor: coords.loc[micro.index]
            for donor, micro in exp.items()
        }
    else:
        exp = exp.drop(index=[0], level='label', errors='ignore') \
                 .droplevel('label')
        coords = coords.loc[exp.index]

    return exp, coords


[docs]def get_interpolated_map(genes, mask, n_neighbors=10, **kwargs):
    """
    Generates dense (i.e., interpolated) expression maps for `genes`

    Uses k-nearest neighbors regression to estimate values at every voxel or
    vertex in the supplied `mask`. Note that by default `lr_mirror` is set to
    'bidirectional' and `norm_matched` is set to False, unless explicitly
    specified.

    Parameters
    ----------
    genes : (G,) list-of-str
        List of gene acronyms for which dense maps are desired
    mask : niimg-like object, optional
        A mask image in MNI space or a tuple of GIFTI images in fsaverage5
        space (where 0 is the background)
    n_neighbors : int, optional
        Number of neighboring tissue samples to use when interpolating data in
        dense map. Default: 10
    kwargs : key-value pairs
        All key-value pairs from :func:`abagen.get_expression_data` except for:
        `atlas`, `atlas_info`, `region_agg`, and `agg_metric`, which will be
        ignored.

    Returns
    -------
    dense : (G,) dict
        Dictionary where keys are `genes` and values are dense maps in space of
        provided `mask`
    """

    genes = np.atleast_1d(genes)
    mask = images.check_atlas(mask)
    if 'atlas' in kwargs:
        del kwargs['atlas']

    # soft reset here and then get ALL the samples in the brain
    # (we don't just want samples _inside_ the mask since we're using NN)
    kwargs.setdefault('lr_mirror', 'bidirectional')
    expression, coords = get_samples_in_mask(mask=None, **kwargs)

    # this is a "naive" matching based only on Euclidean distance
    # TODO: use surface distances when working with surfaces
    # TODO: constrain by hemisphere (and structure?)
    exptree = matching.AtlasTree(np.asarray(expression.index),
                                 coords=np.asarray(coords))
    dist, idx = exptree.tree.query(mask.coords, k=n_neighbors)
    dist = _get_weights(dist)

    # get average of nearest neighbors
    dense = np.zeros(((genes.shape[0],) + mask._shape))
    denom = np.sum(dist, axis=1)
    for n, j in enumerate(genes):
        num = np.sum(np.asarray(expression[j])[idx] * dist, axis=1)
        dense[n][mask._nz] = num / denom

    return dict(zip(genes, dense))


def _get_weights(dist):
    """ Gets inverse of `dist`, handling potential infs

    Parameters
    ----------
    dist : array_like
        Distances to be converted to weights

    Returns
    -------
    weights : np.ndarray
        Inverse of `dist`
    """

    with np.errstate(divide='ignore'):
        dist = 1. / dist
    isinf = np.isinf(dist)
    infrow = np.any(isinf, axis=1)
    dist[infrow] = isinf[infrow]

    return dist


def _interpolate_missing(atlas, labels, microarray, annotation):
    """ Interpolate missing data in `atlas` with weighted nearest-neighbors
    """

    data, labs = [], []
    for label in np.setdiff1d(atlas.labels, labels):
        # it's more cost efficient to do this check here
        if atlas.atlas_info is not None:
            cols = ['structure', 'hemisphere']
            match = atlas.atlas_info.loc[label, cols] == annotation[cols]
            if not np.any(np.all(match, axis=1)):
                continue

        # get sample indices for every node of `lab` in `atlas`
        idx, dist = atlas.fill_label(annotation, label, return_dist=True)
        if np.all(np.isinf(dist)):
            continue

        # get expression of selected tissue samples and weights
        dist = _get_weights(dist[:, None])

        # get weighted average of sample expression and store
        exp = (microarray.loc[idx] * dist).sum(axis=0) / dist.sum()
        sampid = pd.Series([f'interp_{label}'], name='sample_id')
        data.append(pd.DataFrame(exp, columns=sampid).T)
        labs.append(pd.DataFrame(label, columns=['label'], index=sampid))

    return pd.concat(data), pd.concat(labs)


def _match_centroids(atlas, labels, microarray, annotation):
    """ Matches missing data in `atlas` based on centroids
    """

    cols = ['mni_x', 'mni_y', 'mni_z']
    tree = matching.AtlasTree(
        np.asarray(annotation.index),
        coords=np.asarray(annotation[cols]),
        atlas_info=annotation.rename_axis('id')
    )
    empty = np.setdiff1d(atlas.labels, labels)

    # get coordinates of centroids for each missing parcel
    centroids = np.r_[[atlas.centroids[lab] for lab in empty]]
    if atlas.atlas_info is not None:
        centroid_info = atlas.atlas_info.loc[empty]
        centroid_info[cols] = centroids
        centroids = centroid_info

    # get idx of tissue sample closest to each centroid
    idx, dist = tree.match_closest_centroids(centroids, return_dist=True)

    # get expression of selected tissue samples
    mask = idx != -1
    exp = microarray.loc[idx[mask]]
    exp.index = pd.Series(empty[mask], name='label')
    # if some labels weren't matched (due to e.g., hemispheric/structural
    # constraints) then append empty rows to the dataframe
    exp = pd.concat([exp, pd.DataFrame(index=empty[~mask])]) \
        .sort_index() \
        .rename_axis('gene_symbol', axis=1)

    return (exp, dict(zip(empty, dist)))