# -*- coding: utf-8 -*-
"""
Functions for processing and correcting gene expression data
"""
import functools
import itertools
import warnings
import numpy as np
import pandas as pd
import scipy.stats as sstats
from scipy.spatial.distance import cdist
from . import images, utils
def _unpack_tuple(var):
"""
Returns first entry of var if there is only one entry
Parameters
----------
var : tuple
Returns
-------
var
"""
if len(var) == 1:
return var[0]
return var
def _batch_correct(data):
"""
Performs batch correction on `data`
Parameters
----------
data : list of (S, G) pandas.DataFrame
Microarray expression data, where `S` is samples (or regions) and `G`
is genes
Returns
-------
    residualized : list of (S, G) numpy.ndarray
        Batch-corrected microarray expression data; rows that were entirely
        NaN in the input are dropped before fitting and are thus absent from
        the output
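    Examples
    --------
    Illustrative sketch only (toy data, not a doctest): two "donors" whose
    values differ by a constant offset should be aligned after correction::
        rng = np.random.default_rng(1234)
        a = pd.DataFrame(rng.random((5, 10)))      # 5 samples x 10 genes
        b = pd.DataFrame(rng.random((7, 10)) + 5)  # constant "batch" offset
        fixed = _batch_correct([a, b])             # list of numpy arrays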
"""
if len(data) < 2:
raise ValueError('Cannot perform batch correction with one batch.')
# need to drop NaNs or the lstsq fit will choke
data = [np.asarray(f) for f in data]
data = [f[np.logical_not(np.all(np.isnan(f), axis=1))] for f in data]
# generate donor label / "batch" array
n_samp = [len(d) for d in data]
batch = np.repeat(range(len(data)), n_samp)
batch = np.column_stack([batch == f for f in np.unique(batch)]).astype(int)
# add intercept for fit (but we won't regress this out)
batch = np.column_stack([batch, np.ones(len(batch), dtype=int)])
# fit least squares and residualize
    raw = np.vstack(data)  # np.row_stack is a deprecated alias of np.vstack
betas = np.linalg.lstsq(batch, raw, rcond=None)[0]
resid = raw - (batch[:, :-1] @ betas[:-1])
residualized = np.split(resid, np.cumsum(n_samp)[:-1])
return residualized
def _rescale(data, low=0, high=1, axis=0):
"""
Rescales `data` to range [`low`, `high`]
Parameters
----------
data : array_like
Input data array to be rescaled
low : float, optional
Lower bound for rescaling. Default: 0
high : float, optional
Upper bound for rescaling. Default: 1
axis : int, optional
Axis of `data` to scale along. Default: 0
Returns
-------
rescaled : np.ndarray
Rescaled data
"""
data = np.asanyarray(data)
dmin = data.min(axis=axis, keepdims=True)
dmax = data.max(axis=axis, keepdims=True)
    # NOTE: constant input along `axis` (dmax == dmin) yields NaN / inf here
    rescaled = ((data - dmin) / (dmax - dmin)) * (high - low) + low
return rescaled
def _sigmoid(data, axis=0):
"""
Normalizes `data` with a standard sigmoid function
Parameters
----------
data : array_like
Input data array to be transformed
axis : int, optional
Axis of `data` to be normalized
Returns
-------
normed : array_like
Normalized input `data`
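    Notes
    -----
    Implements the standard sigmoid ``1 / (1 + exp(-(x - mean) / std))``
    along `axis`, where the standard deviation is computed with one delta
    degree of freedom (``ddof=1``)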
"""
data = np.asanyarray(data)
mean = np.mean(data, axis=axis, keepdims=True)
std = np.std(data, ddof=1, axis=axis, keepdims=True)
    normed = 1 / (1 + np.exp(-(data - mean) / std))
return normed
def _scaledsig(data, low=0, high=1, axis=0):
"""
Normalizes `data` with a scaled sigmoid function
Parameters
----------
data : array_like
Input data array to be transformed
low : float, optional
Lower bound for rescaling. Default: 0
high : float, optional
Upper bound for rescaling. Default: 1
axis : int, optional
Axis of `data` to be normalized
Returns
-------
normed : array_like
Normalized input `data`
"""
sig = _sigmoid(np.asanyarray(data), axis=axis) # sigmoid transform
normed = _rescale(sig, low=low, high=high, axis=axis) # scales data
return normed
def _rs(data, axis=0):
"""
Normalizes `data` with a robust sigmoid function
Parameters
----------
data : array_like
Input data array to be transformed
axis : int, optional
Axis of `data` to be normalized
Returns
-------
normed : array_like
Normalized input `data`
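    Notes
    -----
    Replaces the mean and standard deviation of the standard sigmoid with
    the median and the normal-scaled IQR (cf. Fulcher & Fornito, 2016),
    i.e., ``1 / (1 + exp(-(x - median) / iqr))``, making the transform
    robust to outliers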
"""
data = np.asanyarray(data)
# calculate sigmoid normalization
med = np.median(data, axis=axis, keepdims=True)
iqr = sstats.iqr(data, axis=axis, scale='normal', keepdims=True)
normed = 1 / (1 + np.exp(-(data - med) / iqr))
return normed
def _srs(data, low=0, high=1, axis=0):
"""
Normalizes `data` with a scaled robust sigmoid function
Parameters
----------
data : array_like
Input data array to be transformed
low : float, optional
Lower bound for rescaling. Default: 0
high : float, optional
Upper bound for rescaling. Default: 1
axis : int, optional
Axis of `data` to be normalized
Returns
-------
normed : array_like
Normalized input `data`
"""
srs = _rs(np.asanyarray(data), axis=axis) # robust sigmoid transform
normed = _rescale(srs, low=low, high=high, axis=axis) # scales data
return normed
def _scaledsig_qnt(data, low=0, high=1, axis=0):
"""
    Normalizes `data` with a quantile-masked scaled sigmoid function
    Masks values outside the 5th and 95th percentiles of `data`, so they do
    not influence the transform, and then applies `_scaledsig()`
Parameters
----------
data : array_like
Input data array to be transformed
low : float, optional
Lower bound for rescaling. Default: 0
high : float, optional
Upper bound for rescaling. Default: 1
axis : int, optional
Axis of `data` to be normalized
Returns
-------
normed : array_like
Normalized input `data`
"""
data = np.asanyarray(data)
    # mask (rather than clip) values beyond the 5th / 95th percentiles so
    # they do not contribute to the mean / SD used by the sigmoid transform
    lq, hq = np.percentile(data, [5, 95], axis=axis, keepdims=True)
    mdata = np.ma.masked_array(data, np.logical_or(data > hq, data < lq))
    normed = np.asarray(_scaledsig(mdata, low=low, high=high, axis=axis))
return normed
def _mixedsig(data, low=0, high=1, axis=0):
"""
Uses `_scaledsig()` if IQR is 0; otherwise, uses `_srs()`
Parameters
----------
data : array_like
Input data array to be transformed
low : float, optional
Lower bound for rescaling. Default: 0
high : float, optional
Upper bound for rescaling. Default: 1
axis : int, optional
Axis of `data` to be normalized
Returns
-------
normed : array_like
Normalized input `data`
"""
    data = np.asanyarray(data)
    iqr = sstats.iqr(data, axis=axis, scale='normal')
    mask = iqr == 0
    # NOTE: the column indexing below assumes `axis=0`
    normed = np.zeros_like(data, dtype=float)
    normed[:, mask] = _scaledsig(data[:, mask], low=low, high=high, axis=axis)
    normed[:, ~mask] = _srs(data[:, ~mask], low=low, high=high, axis=axis)
    # zero-variance (constant) columns produce NaNs; set those to 0
    return np.nan_to_num(normed)
NORMALIZATION_METHODS = dict(
sig=functools.partial(_sigmoid, axis=0),
scaled_sig=functools.partial(_scaledsig, low=0, high=1, axis=0),
scaled_sig_qnt=functools.partial(_scaledsig_qnt, low=0, high=1, axis=0),
mixed_sig=functools.partial(_mixedsig, low=0, high=1, axis=0),
rs=functools.partial(_rs, axis=0),
srs=functools.partial(_srs, low=0, high=1, axis=0),
center=lambda x: x - x.mean(axis=0, keepdims=True),
zscore=functools.partial(sstats.zscore, axis=0, ddof=1),
minmax=functools.partial(_rescale, low=0, high=1, axis=0)
)
# aliases
for alias, orig in [('demean', 'center'),
('rsig', 'rs'),
('robust_sigmoid', 'rs'),
('scaled_rsig', 'srs'),
('scaled_robust_sigmoid', 'srs'),
('sigmoid', 'sig'),
('scaled_sigmoid', 'scaled_sig'),
('scaled_sigmoid_quantiles', 'scaled_sig_qnt'),
('mixed_sigmoid', 'mixed_sig')]:
NORMALIZATION_METHODS[alias] = NORMALIZATION_METHODS[orig]
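# aliases point at the very same callables as their originals; for example,
# NORMALIZATION_METHODS['scaled_robust_sigmoid'] is NORMALIZATION_METHODS['srs']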
def normalize_expression(expression, norm='srs', structures=None,
                         ignore_warn=False):
"""
Performs normalization on `expression` data
Parameters
----------
expression : list of (S, G) pandas.DataFrame
Microarray expression data to be normalized, where `S` is samples (or
regions) and `G` is genes
    norm : str, optional
        Function with which to normalize expression data. See Notes for more
        information on options. Default: 'srs' (scaled robust sigmoid)
structures : list of (S,) pandas.DataFrame
Structural designations of `S` samples (or regions) in `expression`.
Index of provided data frames should be identical to `expression` and
must have at least column 'structure'. If provided, normalization will
be performed separately for each distinct structural class. Default:
None
ignore_warn : bool, optional
Whether to suppress potential warnings raised by normalization.
Default: False
Returns
-------
normalized : list of (S, G) pandas.DataFrame
Data from `expression` normalized separately for each gene
Notes
-----
The following methods can be used for normalizing gene expression values
for each donor (adapted from [PC2]_):
1. ``norm='center'``
Removes the mean of data in each column. Aliased to 'demean'
2. ``norm='zscore'``
Applies a basic z-score (subtract mean, divide by standard deviation) to
each column; uses degrees of freedom equal to one for standard deviation
3. ``norm='minmax'``
        Scales data in each column to the unit interval (i.e., range 0-1)
4. ``norm='sigmoid'``
Applies a sigmoidal transform function to normalize data in each column.
Aliased to 'sig'
5. ``norm='scaled_sigmoid'``
Combines 'sigmoid' and 'minmax'. Aliased to 'scaled_sig'
6. ``norm='scaled_sigmoid_quantiles'``
Caps input data at the 5th and 95th percentiles before performing the
'scaled_sigmoid' transform. Aliased to 'scaled_sig_qnt'
7. ``norm='robust_sigmoid'``
Uses a robust sigmoid function ([PC1]_) to normalize data in each column.
Aliased to 'rs' and 'rsig'
8. ``norm='scaled_robust_sigmoid'``
Combines 'robust_sigmoid' and 'minmax'. Aliased to 'srs' and 'scaled_rsig'
9. ``norm='mixed_sigmoid'``
Uses 'scaled_sigmoid' transform for columns where the IQR is 0; otherwise,
uses the 'scaled_robust_sigmoid' transform. Aliased to 'mixed_sig'
10. ``norm='batch'``
Uses a linear model to remove donor effects from data. Differs from other
methods in that all donors are simultaneously fit to the same model and
data are residualized based on estimated betas. Linear model includes the
intercept but it is not removed during residualization
References
----------
.. [PC1] Fulcher, B. D., & Fornito, A. (2016). A transcriptional signature
of hub connectivity in the mouse connectome. Proceedings of the National
Academy of Sciences, 113(5), 1435-1440.
.. [PC2] Fulcher, B. D., Little, M. A., & Jones, N. S. (2013). Highly
comparative time-series analysis: the empirical structure of time series
and their methods. Journal of the Royal Society Interface, 10(83),
20130048
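    Examples
    --------
    Illustrative sketch only (random data standing in for real microarray
    expression)::
        rng = np.random.default_rng(1234)
        donors = [pd.DataFrame(rng.random((10, 50))) for _ in range(2)]
        normed = normalize_expression(donors, norm='srs')
        # with 'srs', each gene (column) is scaled to the unit interval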
"""
    # 'batch' is not a column-wise transform and has no entry in
    # NORMALIZATION_METHODS, so it is handled separately below
    normfunc = None
    if norm != 'batch':
        try:
            normfunc = NORMALIZATION_METHODS[norm]
        except KeyError:
            raise ValueError('Provided value for `norm` not recognized. Must '
                             'be one of '
                             f'{list(NORMALIZATION_METHODS) + ["batch"]}. '
                             f'Received: {norm}')
kwargs = dict(all='ignore') if ignore_warn else {}
# FIXME: I hate having to do this...
if isinstance(expression, pd.DataFrame):
expression = [expression]
if structures is not None:
if isinstance(structures, pd.DataFrame):
structures = [structures]
        if (len(structures) != len(expression)
                or any(len(s) != len(e)
                       for s, e in zip(structures, expression))):
            raise ValueError('Length of `structures` class designations '
                             'differs from provided `expression`.')
else:
structures = [
pd.DataFrame(dict(structure='same'), index=exp.index)
for exp in expression
]
if norm == 'batch':
corrected = _batch_correct(expression)
    normexp = []
    for n, exp in enumerate(expression):
        normalized = pd.DataFrame(np.nan, columns=exp.columns, index=exp.index)
        notna = np.logical_not(exp.isna().all(axis=1))
        if norm == 'batch':
            # batch correction is fit across all donors simultaneously, so it
            # cannot be run separately for each structural class; assign the
            # residualized data for this donor directly
            normalized.loc[notna] = corrected[n]
        else:
            gb = structures[n].groupby('structure')
            for grp, samples in gb.groups.items():
                idx = samples[np.asarray(notna.loc[samples])]
                data = np.asarray(exp.loc[idx])
                # normalize the data (however was specified)
                with np.errstate(**kwargs):
                    normalized.loc[idx] = normfunc(data)
        normexp.append(normalized)
return _unpack_tuple(normexp)
def remove_distance(coexpression, atlas, atlas_info=None, labels=None):
"""
Corrects for distance-dependent correlation effects in `coexpression`
    Regresses Euclidean distance between regions in `atlas` from the
    correlated gene expression array `coexpression`. If `atlas_info` is
    provided, different connection types (e.g., cortex-cortex,
    cortex-subcortex, subcortex-subcortex) will be residualized
    independently.
Parameters
----------
    coexpression : (R, R) array_like
Correlated gene expression array, where `R` is the number of regions,
as generated with e.g., `numpy.corrcoef(expression)`.
atlas : niimg-like object
A parcellation image in MNI space, where each parcel is identified by a
unique integer ID
atlas_info : str or pandas.DataFrame, optional
Filepath to or pre-loaded dataframe containing information about
`atlas`. Must have at least columns 'id', 'hemisphere', and 'structure'
        containing information mapping atlas IDs to hemisphere (i.e., "L",
        "R")
and broad structural class (i.e., "cortex", "subcortex", "cerebellum").
Default: None
    labels : (N,) array_like, optional
        If only a subset `N` of the ROIs in `atlas` were used to generate
        the `coexpression` array, this array should specify which. Not
        specifying this may cause a ValueError if `atlas` and `atlas_info`
        do not match. Default: None
Returns
-------
residualized : (R, R) numpy.ndarray
Provided `coexpression` data residualized against spatial distance
between region pairs
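    Examples
    --------
    Illustrative sketch only; `atlas` and `expression` here are stand-ins
    for a real parcellation image and an (R, G) expression matrix::
        coexpression = np.corrcoef(expression)  # (R, R) matrix
        residualized = remove_distance(coexpression, atlas)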
"""
atlas = images.check_atlas(atlas)
# check atlas + coexpression make sense
if labels is None:
labels = atlas.labels
if len(labels) != len(coexpression):
        raise ValueError(f'Provided IDs for {len(labels)} parcels are a '
'different length than provided coexpression matrix '
f'of size {coexpression.shape}. Please confirm '
'inputs and try again.')
# load atlas_info, if provided
if atlas_info is not None:
atlas_info = images.check_atlas_info(atlas_info, labels)
# check that provided coexpression array is symmetric
if not np.allclose(coexpression, coexpression.T, atol=1e-10):
raise ValueError('Provided coexpression matrix is not symmetric')
# we'll do basic Euclidean distance correction for now
# TODO: implement gray matter volume / cortical surface path distance
    centroids = np.vstack([atlas.centroids[lab] for lab in labels])
dist = cdist(centroids, centroids, metric='euclidean')
corr_resid = np.zeros_like(coexpression)
triu_inds = np.triu_indices_from(coexpression, k=1)
# if no atlas_info, just residualize all correlations against distance
if atlas_info is None:
corr_resid[triu_inds] = _resid_dist(coexpression[triu_inds],
dist[triu_inds])
# otherwise, we can residualize the different connection types separately
else:
triu_inds = np.ravel_multi_index(triu_inds, corr_resid.shape)
coexpression, dist = coexpression.ravel(), dist.ravel()
types = ['cortex', 'subcortex']
for src, tar in itertools.combinations_with_replacement(types, 2):
# get indices of sources and targets
sources = np.where(atlas_info.structure == src)[0]
targets = np.where(atlas_info.structure == tar)[0]
inds = np.ravel_multi_index(np.ix_(sources, targets),
corr_resid.shape)
if src != tar: # e.g., cortex + subcortex
rev = np.ravel_multi_index(np.ix_(targets, sources),
corr_resid.shape)
inds = np.append(inds.ravel(), rev.ravel())
# find intersection of source / target indices + upper triangle
inds = np.intersect1d(triu_inds, inds)
back = np.unravel_index(inds, corr_resid.shape)
# residualize
corr_resid[back] = _resid_dist(coexpression[inds], dist[inds])
    # mirror the residualized upper triangle and restore the unit diagonal
    corr_resid = (corr_resid + corr_resid.T + np.eye(len(corr_resid)))
return corr_resid
def _resid_dist(dv, iv):
"""
Calculates residuals of `dv` after controlling for `iv`
Parameters
----------
dv : array_like
Dependent variable
iv : array_like
Independent variable; removed from `dv`
Returns
-------
residuals : array_like
Residuals of `dv` after controlling for `iv`
"""
    dv = dv.squeeze()
    # design matrix: the independent variable plus an intercept column
    distance = np.column_stack((iv, np.ones_like(iv)))
betas, *rest = np.linalg.lstsq(distance, dv, rcond=None)
residuals = dv - (distance @ betas)
return residuals.squeeze()
def keep_stable_genes(expression, threshold=0.9, percentile=True, rank=True,
                      return_stability=False):
"""
Removes genes in `expression` with differential stability < `threshold`
Calculates the similarity of gene expression across brain regions for every
pair of donors in `expression`. Similarity is averaged across donor pairs
and genes whose mean similarity falls below `threshold` are removed.
Parameters
----------
expression : list of (R, G) pandas.DataFrame
Where each entry is the microarray expression of `R` regions across `G`
genes for a given donor
    threshold : [0, 1] float, optional
        Minimum required average similarity (e.g., correlation) across donors
        for a gene to be retained, or the percentile thereof when
        ``percentile=True``. Default: 0.9
percentile : bool, optional
Whether to treat `threshold` as a percentile instead of an absolute
cutoff. For example, `threshold=0.9` and `percentile=True` would
retain only those genes with a differential stability in the top 10% of
all genes, whereas `percentile=False` would retain only those genes
with differential stability > 0.9. Default: True
rank : bool, optional
Whether to calculate similarity as Spearman correlation instead of
Pearson correlation. Default: True
return_stability : bool, optional
Whether to return stability estimates for each gene in addition to
expression data. Default: False
Returns
-------
expression : list of (R, Gr) pandas.DataFrame
Microarray expression for `R` regions across `Gr` genes, where `Gr` is
the number of retained genes
stability : (G,) numpy.ndarray
Stability (average correlation) of each gene across pairs of donors.
Only returned if ``return_stability=True``
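    Examples
    --------
    Illustrative sketch only (identical toy "donors", so every gene has a
    differential stability of 1 and is retained)::
        rng = np.random.default_rng(1234)
        donor = pd.DataFrame(rng.random((10, 50)))
        kept = keep_stable_genes([donor, donor.copy()],
                                 threshold=0.9, percentile=False)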
"""
# get number of donors and number of genes
num_subj = len(expression)
num_gene = expression[0].shape[-1]
# rank data, if necessary
for_corr = expression if not rank else [e.rank() for e in expression]
    # get correlation of gene expression across regions for all donor pairs;
    # sum(range(num_subj)) == num_subj * (num_subj - 1) / 2 unique pairs
    gene_corrs = np.zeros((num_gene, sum(range(num_subj))))
for n, (s1, s2) in enumerate(itertools.combinations(range(num_subj), 2)):
regions = np.intersect1d(for_corr[s1].dropna(axis=0, how='all').index,
for_corr[s2].dropna(axis=0, how='all').index)
gene_corrs[:, n] = utils.efficient_corr(for_corr[s1].loc[regions],
for_corr[s2].loc[regions])
# average similarity across donors (ignore NaNs)
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=RuntimeWarning,
message='Mean of empty slice')
gene_corrs = np.nan_to_num(np.nanmean(gene_corrs, axis=1))
# calculate absolute threshold if percentile is desired
if percentile:
threshold = np.percentile(gene_corrs, threshold * 100)
keep_genes = gene_corrs >= threshold
expression = [e.iloc[:, keep_genes] for e in expression]
if return_stability:
return expression, gene_corrs
return expression