Source code for abagen.allen

# -*- coding: utf-8 -*-
"""
Functions for mapping AHBA microarray dataset to atlases and and parcellations
"""

from functools import reduce
import logging
import warnings

import nibabel as nib
import numpy as np
import pandas as pd

from . import (correct, datasets, images, io, matching, probes_, reporting,
               samples_, utils)
from .utils import first_entry, flatten_dict

logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)
LGR = logging.getLogger('abagen')


[docs]def get_expression_data(atlas, atlas_info=None, *, ibf_threshold=0.5, probe_selection='diff_stability', donor_probes='aggregate', sim_threshold=None, lr_mirror=None, exact=None, missing=None, tolerance=2, sample_norm='srs', gene_norm='srs', norm_matched=True, norm_structures=False, region_agg='donors', agg_metric='mean', corrected_mni=True, reannotated=True, return_counts=False, return_donors=False, return_report=False, donors='all', data_dir=None, verbose=0, n_proc=1): """ Assigns microarray expression data to ROIs defined in `atlas` This function aims to provide a workflow for generating pre-processed, microarray expression data from the Allen Human Brain Atlas ([A2]_) for abitrary `atlas` designations. First, some basic filtering of genetic probes is performed, including: 1. Intensity-based filtering of microarray probes to remove probes that do not exceed a certain level of background noise (specified via the `ibf_threshold` parameter), 2. Selection of a single, representative probe (or collapsing across probes) for each gene, specified via the `probe_selection` parameter (and influenced by the `donor_probes` parameter), and 3. Optional mirroring of the tissue samples across the left/right hemisphere boundary, as specified via the `lr_mirror` parameter (turned off by default). Tissue samples are then matched to parcels in the defined `atlas` for each donor. If `atlas_info` is provided then this matching is constrained by both hemisphere and tissue class designation (e.g., cortical samples from the left hemisphere are only matched to ROIs in the left cortex, subcortical samples from the right hemisphere are only matched to ROIs in the left subcortex); see the `atlas_info` parameter description for more information. Matching of microarray samples to parcels in `atlas` is done via a multi- step process: 1. Determine if the sample falls directly within a parcel, 2. Check to see if there are nearby parcels by slowly expanding the search space to include nearby voxels, up to a specified distance (specified via the `tolerance` parameter), 3. If there are multiple nearby parcels, the sample is assigned to the closest parcel, as determined by the parcel centroid. If at any step a sample can be assigned to a parcel the matching process is terminated. When the provided atlas is not volumetric (i.e., surface-based) the samples are simply matched to the nearest vertex, and `tolerance` is used as a standard deviation threshold. More control over the sample matching can be obtained by setting the `exact` parameter; see the parameter description for more information. Once all samples have been matched to parcels for all supplied donors, the microarray expression data are optionally normalized via the provided `sample_norm` and `gene_norm` functions (which are influenced by the `norm_matched` and `norm_structures` parameters) before being aggregated across donors via the supplied `region_agg` and `agg_metric` parameters. Parameters ---------- atlas : niimg-like object or dict A parcellation image in MNI space or a tuple of GIFTI images in fsaverage5 space, where each parcel is identified by a unique integer ID. Alternatively, a dictionary where keys are donor IDs and values are parcellation images (or surfaces) in the native space of each donor. atlas_info : os.PathLike or pandas.DataFrame, optional Filepath to or pre-loaded dataframe containing information about `atlas`. Must have at least columns 'id', 'hemisphere', and 'structure' containing information mapping atlas IDs to hemisphere (i.e, "L", "R", "B") and broad structural class (i.e., "cortex", "subcortex/brainstem", "cerebellum"). If provided, this will constrain matching of tissue samples to regions in `atlas`. If `atlas` is a tuple of GIFTI images with valid label tables this will be intuited from the data. Default: None ibf_threshold : [0, 1] float, optional Threshold for intensity-based filtering. This number specifies the ratio of samples, across all supplied donors, for which a probe must have signal significantly greater than background noise in order to be retained. Default: 0.5 probe_selection : str, optional Selection method for subsetting (or collapsing across) probes that index the same gene. Must be one of 'average', 'max_intensity', 'max_variance', 'pc_loading', 'corr_variance', 'corr_intensity', or 'diff_stability', 'rnaseq'; see Notes for more information on different options. Default: 'diff_stability' donor_probes : {'aggregate', 'independent', 'common'}, optional Whether specified `probe_selection` method should be performed with microarray data from all donors ('aggregate'), independently for each donor ('independent'), or based on the most common selected probe across donors ('common'). Not all combinations of `probe_selection` and `donor_probes` methods are viable. Default: 'aggregate' sim_threshold : (0, inf) float, optional Threshold for inter-areal similarity filtering. Samples are correlated across probes and those samples with a total correlation less than `sim_threshold` standard deviations below the mean across samples are excluded from futher analysis. If not specified no filtering is performed. Default: None lr_mirror : {None, 'bidirectional', 'leftright', 'rightleft'}, optional Whether to mirror microarray expression samples across hemispheres to increase spatial coverage. Using 'bidirectional' will mirror samples across both hemispheres, 'leftright' will mirror samples in the left hemisphere to the right, and 'rightleft' will mirror the right to the left. Default: None missing : {'centroids', 'interpolate', None}, optional How to handle regions in `atlas` that are not assigned any tissue samples. If 'centroids', any empty regions will be assigned the expression value of the nearest tissue sample (defined as the sample with the closest Euclidean distance to the parcel centroid). If 'interpolate', expression values will be interpolated in the empty regions by assigning every node in the region the expression of the nearest sample and taking a weighted (inverse distance) average. If not specified empty regions will be returned with expression values of NaN. Default: None tolerance : int, optional Distance (in mm) that a sample must be from a parcel for it to be matched to that parcel. If `atlas` is a tuple of surface files then this measure is a standard deviation threshold (i.e., samples greater than `tolerance` SDs away from the mean matched distance are ignored). Default: 2 sample_norm : {'rs', 'srs', 'minmax', 'center', 'zscore', None}, optional Method by which to normalize microarray expression values for each sample. Expression values are normalized separately for each sample and donor across all genes; see Notes for more information on different methods. If None is specified then no normalization is performed. Default: 'srs' gene_norm : {'rs', 'srs', 'minmax', 'center', 'zscore', None}, optional Method by which to normalize microarray expression values for each donor. Expression values are normalized separately for each gene and donor across all samples; see Notes for more information on different methods. If None is specified then no normalization is performed. Default: 'srs' norm_matched : bool, optional Whether to perform gene normalization (`gene_norm`) across only those samples matched to regions in `atlas` instead of all available samples. If `atlas` is very small (i.e., only a few regions of interest), using `norm_matched=False` is suggested. Default: True norm_structures : bool, optional Whether to perform gene normalization (`gene_norm`) within structural classes (i.e., 'cortex', 'subcortex/brainstem', 'cerebellum') instead of across all available samples. Default: False region_agg : {'samples', 'donors'}, optional When multiple samples are identified as belonging to a region in `atlas` this determines how they are aggegated. If 'samples', expression data from all samples for all donors assigned to a given region are combined. If 'donors', expression values for all samples assigned to a given region are combined independently for each donor before being combined across donors. See `agg_metric` for mechanism by which samples are combined. Default: 'donors' agg_metric : {'mean', 'median'} or callable, optional Mechanism by which to reduce sample-level expression data into region- level expression (see `region_agg`). If a callable, should be able to accept an `N`-dimensional input and the `axis` keyword argument and return an `N-1`-dimensional output. Default: 'mean' corrected_mni : bool, optional Whether to use the "corrected" MNI coordinates shipped with the `alleninf` package instead of the coordinates provided with the AHBA data when matching tissue samples to anatomical regions. Default: True reannotated : bool, optional Whether to use reannotated probe information provided by [A1]_ instead of the default probe information from the AHBA dataset. Using reannotated information will discard probes that could not be reliably matched to genes. Default: True return_counts : bool, optional Whether to return dataframe containing information on how many samples were assigned to each parcel in `atlas` for each donor. Default: False return_donors : bool, optional Whether to return donor-level expression arrays instead of aggregating expression across donors with provided `agg_metric`. Default: False return_report : bool, optional Whether to return a string containing longform text describing the processing procedures used to generate the `expression` DataFrames returned by this function. Default: False donors : list, optional List of donors to use as sources of expression data. Can be either donor numbers or UID. If not specified will use all available donors. Note that donors '9861' and '10021' have samples from both left + right hemispheres; all other donors have samples from the left hemisphere only. Default: 'all' data_dir : os.PathLike, optional Directory where expression data should be downloaded (if it does not already exist) / loaded. If not specified will use the current directory. Default: None verbose : int, optional Specifies verbosity of status messages to display during workflow. Higher numbers increase verbosity of messages while zero suppresses all messages. Default: 1 n_proc : int, optional Number of processors to use to download AHBA data. Can parallelize up to six times. Default: 1 Returns ------- expression : (R, G) pandas.DataFrame Microarray expression for `R` regions in `atlas` for `G` genes, aggregated across donors, where the index corresponds to the unique integer IDs of `atlas` and the columns are gene names. If ``return_donors=True`` then this is a list of (R, G) dataframes, one for each donor. counts : (R, D) pandas.DataFrame Number of samples assigned to each of `R` regions in `atlas` for each of `D` donors (if multiple donors were specified); only returned if ``return_counts=True``. report : str Methods describing processing procedures implemented to generate `expression`, suitable to be used in a manuscript Methods section. Only returned if ``return_report=True``. Notes ----- The following methods can be used for collapsing across probes when multiple probes are available for the same gene: 1. ``probe_selection='average'`` Takes the average of expression data across all probes indexing the same gene. Providing 'mean' as the input method will return the same thing. This method can only be used when `donor_probes='aggregate'`. 2. ``probe_selection='max_intensity'`` Selects the probe with the maximum average expression across samples from all donors. 3. ``probe_selection='max_variance'`` Selects the probe with the maximum variance in expression across samples from all donors. 4. ``probe_selection='pc_loading'`` Selects the probe with the maximum loading along the first principal component of a decomposition performed across samples from all donors. 5. ``probe_selection='corr_intensity'`` Selects the probe with the maximum correlation to other probes from the same gene when >2 probes exist; otherwise, uses the same procedure as `max_intensity`. 6. ``probe_selection='corr_variance'`` Selects the probe with the maximum correlation to other probes from the same gene when >2 probes exist; otherwise, uses the same procedure as `max_varance`. 7. ``probe_selection='diff_stability'`` Selects the probe with the most consistent pattern of regional variation across donors (i.e., the highest average correlation across brain regions between all pairs of donors). This method can only be used when `donor_probes='aggregate'`. 8. ``method='rnaseq'`` Selects probes with most consistent pattern of regional variation to RNAseq data (across the two donors with RNAseq data). This method can only be used when `donor_probes='aggregate'`. Note that for incompatible combinations of `probe_selection` and `donor_probes` (as detailed above), the `probe_selection choice will take precedence. For example, providing ``probe_selection='diff_stability'`` and ``donor_probes='independent'`` will cause `donor_probes` to be reset to `'aggregate'`. The following methods can be used for normalizing microarray expression values prior to aggregating: 1. ``{sample,gene}_norm=='rs'`` Uses a robust sigmoid function as in [A3]_ to normalize values 2. ``{sample,gene}_norm='srs'`` Same as 'rs' but scales output to the unit normal (i.e., range 0-1) 3. ``{sample,gene}_norm='minmax'`` Scales data to the unit normal (i.e., range 0-1) 4. ``{sample,gene}_norm='center'`` Removes the mean of expression values 5. ``{sample,gene}_norm='zscore'`` Applies a basic z-score (subtract mean, divide by standard deviation); uses degrees of freedom equal to one for standard deviation References ---------- .. [A1] Arnatkevic̆iūtė, A., Fulcher, B. D., & Fornito, A. (2019). A practical guide to linking brain-wide gene expression and neuroimaging data. NeuroImage, 189, 353-367. .. [A2] Hawrylycz, M.J. et al. (2012) An anatomically comprehensive atlas of the adult human transcriptome. Nature, 489, 391-399. .. [A3] Fulcher, B. D., & Fornito, A. (2016). A transcriptional signature of hub connectivity in the mouse connectome. Proceedings of the National Academy of Sciences, 113(5), 1435-1440. """ # set logging verbosity level LGR.setLevel(dict(zip(range(3), [40, 20, 10])).get(int(verbose), 10)) # get combination functions agg_metric = utils.check_metric(agg_metric) # check probe_selection input if probe_selection not in probes_.SELECTION_METHODS: raise ValueError('Provided probe_selection method is invalid, must be ' f'one of {list(probes_.SELECTION_METHODS)}. Received ' f'value: {probe_selection}') dprobe_opts = ('aggregate', 'independent', 'common') if donor_probes not in dprobe_opts: raise ValueError('Provided donor_probes method is invalid, must be one' f' of {dprobe_opts}. Received value: {donor_probes}') # check lr_mirror input if isinstance(lr_mirror, bool): warnings.warn('Setting lr_mirror to a boolean value will be ' 'deprecated in an upcoming release. Use either ' '`lr_mirror=None` or `lr_mirror="bidirectional"` ' 'instead. See documentation for more details.', FutureWarning) lr_mirror = 'bidirectional' if lr_mirror else None mirror_opts = (None, 'bidirectional', 'leftright', 'rightleft') if lr_mirror not in mirror_opts: raise ValueError('Provided lr_mirror method is invalid, must be one ' f'of {mirror_opts}. Received value: {lr_mirror}') # check exact/missing input if exact is not None: warnings.warn('The exact parameter will be deprecated in an upcoming ' 'release. Use the `missing` parameter instead. See ' 'documentation for more details.', FutureWarning) missing = None if exact else 'centroids' missing_opts = (None, 'interpolate', 'centroids') if missing not in missing_opts: raise ValueError('Provided missing method is invalid, must be one ' f'of {missing_opts}. Received value: {missing}') # check that region_agg is viable with return_donors if return_donors and region_agg == 'samples': raise ValueError('Cannot return donor-level expresison data when ' 'region_agg parameter is set to "samples".') # load atlas and atlas_info, if provided, and coerce to dict atlas, group_atlas = images.coerce_atlas_to_dict(atlas, donors, atlas_info) # fetch files (downloading if necessary) and unpack to variables files = datasets.fetch_microarray(data_dir=data_dir, donors=donors, verbose=verbose, n_proc=n_proc) if probe_selection == 'diff_stability' and len(files) == 1: raise ValueError('Cannot use diff_stability for probe_selection with ' 'only one donor. Please specify a different probe_' 'selection method or use more donors.') elif probe_selection == 'rnaseq': # fetch RNAseq if we're gonna need it datasets.fetch_rnaseq(data_dir=data_dir, donors=donors, verbose=verbose) # get some info on labels in `atlas` all_labels = utils.first_entry(atlas).labels n_gb = (8 * len(all_labels) * 30000) / (1024 ** 3) if n_gb > 1: LGR.warning(f'Output matrix may require up to {n_gb:.2f} GB RAM') # get dataframe of probe information (reannotated or otherwise) # the Probes.csv files are the same for every donor so just grab the first probe_info = io.read_probes(first_entry(files, 'probes')) if reannotated: probe_info = probes_.reannotate_probes(probe_info) # drop probes with no/invalid Entrez ID probe_info = probe_info.dropna(subset=['entrez_id']) # update the annotation "files". this handles updating the MNI coordinates, # dropping mistmatched samples (where MNI coordinates don't match the # provided ontology), and mirroring samples across hemispheres, if desired for donor, data in files.items(): annot, ontol = data['annotation'], data['ontology'] t1w = None if not group_atlas: t1w = datasets.fetch_raw_mri(donors=donor, data_dir=data_dir, verbose=verbose)[donor]['t1w'] annot = samples_.update_coords(annot, corrected_mni=corrected_mni, native_space=t1w) if lr_mirror is not None: annot = samples_.mirror_samples(annot, ontol, swap=lr_mirror) annot = samples_.drop_mismatch_samples(annot, ontol) if sim_threshold is not None: annot = samples_.similarity_threshold(data['microarray'], annot, probe_info, threshold=sim_threshold) data['annotation'] = annot annotation = flatten_dict(files, 'annotation') # intensity-based filtering of probes probe_info = probes_.filter_probes(flatten_dict(files, 'pacall'), annotation, probe_info, threshold=ibf_threshold) # get probe-reduced microarray expression data for all donors based on # selection method; this will be a list of gene x sample dataframes (one # for each donor) microarray = probes_.collapse_probes(flatten_dict(files, 'microarray'), annotation, probe_info, method=probe_selection, donor_probes=donor_probes) centroids = [] counts = pd.DataFrame(np.zeros((len(all_labels) + 1, len(microarray)), dtype=int), index=np.append([0], all_labels), columns=microarray.keys()) for subj in microarray: if lr_mirror is not None: # reset index idx = pd.Series(range(1, len(microarray[subj]) + 1), name='sample_id') microarray[subj].index = idx.copy() annotation[subj].index = idx.copy() # assign samples to regions labels = atlas[subj].label_samples(annotation[subj], tolerance) # if we're doing exact matching and want to aggregate samples w/i # regions, remove the non-labelled samples prior to normalization. # otherwise, we'll remove the non-labelled samples after normalization nz = np.asarray(labels != 0).squeeze() if nz.sum() == 0: LGR.warning(f'No samples matched to atlas for donor {subj}') microarray[subj].index = labels['label'] if missing == 'centroids': missing += [(pd.DataFrame(), {})] continue if norm_matched: microarray[subj] = microarray[subj].loc[nz] annotation[subj] = annotation[subj].loc[nz] labels = labels.loc[nz] # if normalizing by structural class get annotation dataframe annot = annotation[subj][['structure']] if norm_structures else None if sample_norm is not None: microarray[subj] = correct.normalize_expression(microarray[subj].T, norm=sample_norm, ignore_warn=True).T if gene_norm is not None: microarray[subj] = correct.normalize_expression(microarray[subj], norm=gene_norm, structures=annot, ignore_warn=True) # get counts of samples collapsed into each ROI labs, num = np.unique(labels, return_counts=True) counts.loc[labs, subj] = num LGR.info(f'{counts.iloc[1:][subj].sum():>3} / {len(nz)} ' f'samples matched to regions for donor #{subj}') # if we don't want to do exact matching then cache which parcels are # missing data and the expression data for the closest sample to that # parcel; we'll use this once we've iterated through all donors if missing is not None: if missing == 'interpolate': exp, lab = _interpolate_missing(atlas[subj], labels, microarray[subj], annotation[subj]) microarray[subj] = pd.concat([microarray[subj], exp]) labels = pd.concat([labels, lab]) elif missing == 'centroids': centroids += [_match_centroids(atlas[subj], labels, microarray[subj], annotation[subj])] microarray[subj] = pd.merge(microarray[subj], labels, on='sample_id') \ .set_index('label') \ .rename_axis('gene_symbol', axis=1) # if we don't want to aggregate over regions return sample-level results if region_agg is None: microarray = { donor: micro.set_index(annotation[donor]['well_id'], append=True) .dropna(axis=1, how='any') for donor, micro in microarray.items() } if not return_donors: microarray = pd.concat(microarray.values()) return microarray if missing == 'centroids': # labels that are missing across all donors empty = reduce(set.intersection, [set(f.index) for f, d in centroids]) LGR.info(f'Matching {len(empty)} region(s) with no data to the ' 'nearest tissue sample(s)') for roi in empty: # take weighted average of samples closest to parcel centroid # across subjects exp, dist = zip(*[(micro.loc[[roi]], dist.get(roi)) for micro, dist in centroids]) weights = _get_weights(np.asarray(dist)[:, None]) exp = pd.DataFrame( (pd.concat(exp) * weights).sum(axis=0) / weights.sum(), columns=pd.Series([roi], name='label') ).T # assign same value to every subject for subj in microarray: microarray[subj] = pd.concat([microarray[subj], exp]) microarray = samples_.aggregate_samples(microarray, labels=all_labels, region_agg=region_agg, agg_metric=agg_metric, return_donors=return_donors) counts = counts.drop([0], axis=0) if return_report: # generate report report = reporting.Report(atlas, atlas_info=atlas[subj].atlas_info, ibf_threshold=ibf_threshold, probe_selection=probe_selection, donor_probes=donor_probes, lr_mirror=lr_mirror, missing=missing, tolerance=tolerance, sample_norm=sample_norm, gene_norm=gene_norm, norm_matched=norm_matched, norm_structures=norm_structures, region_agg=region_agg, agg_metric=agg_metric, corrected_mni=corrected_mni, reannotated=reannotated, donors=donors, return_donors=return_donors, data_dir=data_dir, counts=counts, n_probes=len(probe_info), n_genes=(first_entry(microarray).shape[1] if return_donors else microarray.shape[1])).body # pack outputs out = (microarray,) if return_counts: out += (counts,) if return_report: out += (report,) if len(out) == 1: out = out[0] return out
[docs]def get_samples_in_mask(mask=None, **kwargs): """ Returns preprocessed microarray expression data for samples in `mask` Uses the same processing workflow as :func:`abagen.get_expression_data` but instead of aggregating samples within regions simply returns sample-level expression data for all samples that fall within boundaries of `mask`. Parameters ---------- mask : niimg-like object or dict, optional A mask image in MNI space or a tuple of GIFTI images in fsaverage5 space (where 0 is the background). Alternatively, a dictionary where keys are donor IDs and values are mask images (or surfaces) in the native space of each donor. If not supplied, all available samples will be returned. Default: None kwargs : key-value pairs All key-value pairs from :func:`abagen.get_expression_data` except for: `atlas`, `atlas_info`, `region_agg`, and `agg_metric`, which will be ignored. If `atlas` is supplied instead of `mask` then `atlas` will be used instead as a modified binary image. If both `atlas` and `mask` are supplied then `mask` will be used Returns ------- expression : (S, G) pandas.DataFrame Microarray expression for `S` samples for `G` genes, aggregated across donors, where the columns are gene names coords : (S,) numpy.ndarray MNI coordinates of samples in `expression`. Even if donor-specific masks are provided MNI coordinates will be returned to ensure comparability between subjects """ # fetch files (downloading if necessary) to get coordinates files = datasets.fetch_microarray(data_dir=kwargs.get('data_dir', None), donors=kwargs.get('donors', 'all'), verbose=kwargs.get('verbose', 1), n_proc=kwargs.get('n_proc', 1)) # get updated coordinates for donor, data in files.items(): annot = data['annotation'] if kwargs.get('corrected_mni', True): annot = samples_.update_mni_coords(annot) files[donor]['annotation'] = annot cols = ['well_id', 'mni_x', 'mni_y', 'mni_z'] coords = np.asarray(pd.concat(flatten_dict(files, 'annotation'))[cols]) well_id, coords = np.asarray(coords[:, 0], 'int'), coords[:, 1:] coords = pd.DataFrame(coords, columns=['x', 'y', 'z'], index=well_id) # in case people mix things up and use atlas instead of mask, use that if kwargs.get('atlas') is not None and mask is None: mask = kwargs['atlas'] elif mask is None: # generate atlas image where each voxel with sample has value of 1 affine = np.eye(4) affine[:-1, -1] = -1 * np.floor(np.max(np.abs(coords), axis=0)) img = np.ones(np.asarray(-2 * affine[:-1, -1], dtype=int)) mask = nib.Nifti1Image(img, affine=affine) # reset these parameters kwargs['atlas'] = mask kwargs['atlas_info'] = None kwargs['region_agg'] = None # soft reset this parameter kwargs.setdefault('norm_matched', False) # get expression data + drop sample coordinates that weren't in atlas exp = get_expression_data(**kwargs) if kwargs.get('return_donors'): exp = { donor: micro.drop(index=[0], level='label', errors='ignore') .droplevel('label') for donor, micro in exp.items() } coords = { donor: coords.loc[micro.index] for donor, micro in exp.items() } else: exp = exp.drop(index=[0], level='label', errors='ignore') \ .droplevel('label') coords = coords.loc[exp.index] return exp, coords
[docs]def get_interpolated_map(genes, mask, n_neighbors=10, **kwargs): """ Generates dense (i.e., interpolated) expression maps for `genes` Uses k-nearest neighbors regression to estimate values at every voxel or vertex in the supplied `mask`. Note that by default `lr_mirror` is set to 'bidirectional' and `norm_matched` is set to False, unless explicitly specified. Parameters ---------- genes : (G,) list-of-str List of gene acronyms for which dense maps are desired mask : niimg-like object, optional A mask image in MNI space or a tuple of GIFTI images in fsaverage5 space (where 0 is the background) n_neighbors : int, optional Number of neighboring tissue samples to use when interpolating data in dense map. Default: 10 kwargs : key-value pairs All key-value pairs from :func:`abagen.get_expression_data` except for: `atlas`, `atlas_info`, `region_agg`, and `agg_metric`, which will be ignored. Returns ------- dense : (G,) dict Dictionary where keys are `genes` and values are dense maps in space of provided `mask` """ genes = np.atleast_1d(genes) mask = images.check_atlas(mask) if 'atlas' in kwargs: del kwargs['atlas'] # soft reset here and then get ALL the samples in the brain # (we don't just want samples _inside_ the mask since we're using NN) kwargs.setdefault('lr_mirror', 'bidirectional') expression, coords = get_samples_in_mask(mask=None, **kwargs) # this is a "naive" matching based only on Euclidean distance # TODO: use surface distances when working with surfaces # TODO: constrain by hemisphere (and structure?) exptree = matching.AtlasTree(np.asarray(expression.index), coords=np.asarray(coords)) dist, idx = exptree.tree.query(mask.coords, k=n_neighbors) dist = _get_weights(dist) # get average of nearest neighbors dense = np.zeros(((genes.shape[0],) + mask._shape)) denom = np.sum(dist, axis=1) for n, j in enumerate(genes): num = np.sum(np.asarray(expression[j])[idx] * dist, axis=1) dense[n][mask._nz] = num / denom return dict(zip(genes, dense))
def _get_weights(dist): """ Gets inverse of `dist`, handling potential infs Parameters ---------- dist : array_like Distances to be converted to weights Returns ------- weights : np.ndarray Inverse of `dist` """ with np.errstate(divide='ignore'): dist = 1. / dist isinf = np.isinf(dist) infrow = np.any(isinf, axis=1) dist[infrow] = isinf[infrow] return dist def _interpolate_missing(atlas, labels, microarray, annotation): """ Interpolate missing data in `atlas` with weighted nearest-neighbors """ data, labs = [], [] for label in np.setdiff1d(atlas.labels, labels): # it's more cost efficient to do this check here if atlas.atlas_info is not None: cols = ['structure', 'hemisphere'] match = atlas.atlas_info.loc[label, cols] == annotation[cols] if not np.any(np.all(match, axis=1)): continue # get sample indices for every node of `lab` in `atlas` idx, dist = atlas.fill_label(annotation, label, return_dist=True) if np.all(np.isinf(dist)): continue # get expression of selected tissue samples and weights dist = _get_weights(dist[:, None]) # get weighted average of sample expression and store exp = (microarray.loc[idx] * dist).sum(axis=0) / dist.sum() sampid = pd.Series([f'interp_{label}'], name='sample_id') data.append(pd.DataFrame(exp, columns=sampid).T) labs.append(pd.DataFrame(label, columns=['label'], index=sampid)) return pd.concat(data), pd.concat(labs) def _match_centroids(atlas, labels, microarray, annotation): """ Matches missing data in `atlas` based on centroids """ cols = ['mni_x', 'mni_y', 'mni_z'] tree = matching.AtlasTree( np.asarray(annotation.index), coords=np.asarray(annotation[cols]), atlas_info=annotation.rename_axis('id') ) empty = np.setdiff1d(atlas.labels, labels) # get coordinates of centroids for each missing parcel centroids = np.r_[[atlas.centroids[lab] for lab in empty]] if atlas.atlas_info is not None: centroid_info = atlas.atlas_info.loc[empty] centroid_info[cols] = centroids centroids = centroid_info # get idx of tissue sample closest to each centroid idx, dist = tree.match_closest_centroids(centroids, return_dist=True) # get expression of selected tissue samples mask = idx != -1 exp = microarray.loc[idx[mask]] exp.index = pd.Series(empty[mask], name='label') # if some labels weren't matched (due to e.g., hemispheric/structural # constraints) then append empty rows to the dataframe exp = pd.concat([exp, pd.DataFrame(index=empty[~mask])]) \ .sort_index() \ .rename_axis('gene_symbol', axis=1) return (exp, dict(zip(empty, dist)))