# -*- coding: utf-8 -*-
"""
Functions for generating workflow methods reports
Note: all text contained within this module is released under a `CC-0 license
<https://creativecommons.org/publicdomain/zero/1.0/>`_.
"""
import logging
import numpy as np
from . import __version__
from .datasets import check_donors, fetch_donor_info
from .images import coerce_atlas_to_dict
from .utils import first_entry
LGR = logging.getLogger('abagen')
METRICS = {
np.mean: 'mean',
np.median: 'median'
}
REFERENCES = dict(
A2019N=('Arnatkevic̆iūtė, A., Fulcher, B. D., & Fornito, A. (2019). '
'A practical guide to linking brain-wide gene expression and '
'neuroimaging data. Neuroimage, 189, 353-367.'),
F2016P=('Fulcher, B. D., & Fornito, A. (2016). A transcriptional '
'signature of hub connectivity in the mouse connectome. '
'Proceedings of the National Academy of Sciences, 113(5), '
'1435-1440.'),
F2013J=('Fulcher, B. D., Little, M. A., & Jones, N. S. (2013). '
'Highly comparative time-series analysis: the empirical '
'structure of time series and their methods. Journal of '
'the Royal Society Interface, 10(83), 20130048.'),
H2012N=('Hawrylycz, M. J., Lein, E. S., Guillozet-Bongaarts, A. L., '
'Shen, E. H., Ng, L., Miller, J. A., ... & Jones, A. R. '
'(2012). An anatomically comprehensive atlas of the adult '
'human brain transcriptome. Nature, 489(7416), 391-399.'),
H2015N=('Hawrylycz, M., Miller, J. A., Menon, V., Feng, D., '
'Dolbeare, T., Guillozet-Bongaarts, A. L., ... & Lein, E. '
'(2015). Canonical genetic signatures of the adult human '
'brain. Nature Neuroscience, 18(12), 1832.'),
M2021B=('Markello, R.D., Arnatkevic̆iūtė, A., Poline, J-B., Fulcher, B. '
'D., Fornito, A., & Misic, B. (2021). Standardizing workflows in '
'imaging transcriptomics with the abagen toolbox. Biorxiv.'),
P2017G=('Parkes, L., Fulcher, B. D., Yücel, M., & Fornito, A. '
'(2017). Transcriptional signatures of connectomic '
'subregions of the human striatum. Genes, Brain and '
'Behavior, 16(7), 647-663.'),
Q2002N=('Quackenbush, J. (2002). Microarray data normalization and '
'transformation. Nature Genetics, 32(4), 496-501.'),
R2018N=('Romero-Garcia, R., Whitaker, K. J., Váša, F., Seidlitz, '
'J., Shinn, M., Fonagy, P., ... & NSPN Consortium. (2018). '
'Structural covariance networks are coupled to expression '
'of genes enriched in supragranular layers of the human '
'cortex. NeuroImage, 171, 256-267.')
)
[docs]class Report:
""" Generates report of methods for :func:`abagen.get_expression_data()`
Refer to the doc-string of the workflow for an overview of paramter options
"""
def __init__(self, atlas, atlas_info=None, *, ibf_threshold=0.5,
probe_selection='diff_stability', donor_probes='aggregate',
lr_mirror=None, missing=None, tolerance=2, sample_norm='srs',
gene_norm='srs', norm_matched=True, norm_structures=False,
region_agg='donors', agg_metric='mean', corrected_mni=True,
reannotated=True, donors='all', return_donors=False,
data_dir=None, counts=None, n_probes=None, n_genes=None):
efflevel = LGR.getEffectiveLevel()
LGR.setLevel(100)
atlas, self.group_atlas = \
coerce_atlas_to_dict(atlas, donors, atlas_info=atlas_info,
data_dir=data_dir)
self.atlas = first_entry(atlas)
self.atlas_info = self.atlas.atlas_info
self.ibf_threshold = ibf_threshold
self.probe_selection = probe_selection
self.donor_probes = donor_probes
self.lr_mirror = lr_mirror
self.missing = missing
self.tolerance = tolerance
self.sample_norm = sample_norm
self.gene_norm = gene_norm
self.norm_matched = norm_matched
self.norm_structures = norm_structures
self.region_agg = region_agg
self.agg_metric = METRICS.get(agg_metric, agg_metric)
self.corrected_mni = corrected_mni
self.reannotated = reannotated
self.donors = donors
self.return_donors = return_donors
self.counts = counts
self.n_probes = n_probes
self.n_genes = n_genes
self.body = self.gen_report()
LGR.setLevel(efflevel)
[docs] def gen_report(self):
""" Generates main text of report
"""
report = ''
report += """
Regional microarry expression data were obtained from {n_donors}
post-mortem brains ({n_female} female, ages {min}--{max}, {mean:.2f}
+/- {std:.2f}) provided by the Allen Human Brain Atlas (AHBA,
https://human.brain-map.org; [H2012N]). Data were processed with the
abagen toolbox (version {vers}; https://github.com/rmarkello/abagen;
[M2021B])
""".format(**_get_donor_demographics(self.donors), vers=__version__)
if self.atlas.volumetric and self.group_atlas:
report += """
using a {n_region}-region volumetric atlas in MNI space.<br>
""".format(n_region=len(self.atlas.labels))
elif not self.atlas.volumetric and self.group_atlas:
report += """
using a {n_region}-region surface-based atlas in MNI space.<br>
""".format(n_region=len(self.atlas.labels))
elif self.atlas.volumetric and not self.group_atlas:
report += """
using a {n_region}-region volumetric atlas, independently aligned
to each donor's native MRI space.<br>
""".format(n_region=len(self.atlas.labels))
else:
report += ".<br>"
if self.reannotated:
report += """
First, microarray probes were reannotated using data provided by
[A2019N]; probes not matched to a valid Entrez ID were discarded.
"""
else:
report += """
First, microarray probes not matched to a valid Entrez ID were
discarded.
"""
if self.ibf_threshold > 0:
report += """
Next, probes were filtered based on their expression intensity
relative to background noise [Q2002N], such that probes with
intensity less than the background in >={threshold:.2f}% of samples
across donors were discarded
""".format(threshold=self.ibf_threshold * 100)
if self.n_probes is not None:
report += """
, yielding {n_probes:,} probes
""".format(n_probes=self.n_probes)
report += "."
if self.probe_selection == 'average':
report += """
When multiple probes indexed the expression of the same gene we
calculated the mean expression across probes.
"""
elif self.probe_selection == 'diff_stability':
report += r"""
When multiple probes indexed the expression of the same gene, we
selected and used the probe with the most consistent pattern of
regional variation across donors (i.e., differential stability;
[H2015N]), calculated with:<br>
$$ \Delta_{{S}}(p) = \frac{{1}}{{\binom{{N}}{{2}}}} \,
\sum_{{i=1}}^{{N-1}} \sum_{{j=i+1}}^{{N}}
\rho[B_{{i}}(p), B_{{j}}(p)] $$<br>
where $ \rho $ is Spearman's rank correlation of the expression of
a single probe, p, across regions in two donors $B_{{i}}$ and
$B_{{j}}$, and N is the total number of donors. Here, regions
correspond to the structural designations provided in the ontology
from the AHBA.
"""
elif self.probe_selection == "pc_loading":
report += """
When multiple probes indexed the expression of the same gene we
selected and used the probe with the highest loading on the first
principal component taken from a decomposition of probe expression
across samples from all donors [P2017G].
"""
elif self.probe_selection == "max_intensity":
report += """
When multiple probes indexed the expression of the same gene we
selected and used the probe with the highest mean intensity across
samples.
"""
elif self.probe_selection == "max_variance":
report += """
When multiple probes indexed the expression of the same gene we
selected and used the probe with the highest variance across
samples.
"""
elif self.probe_selection == "corr_intensity":
report += """
When multiple probes indexed the expression of the same gene we
selected and used the expression profile of a single representative
probe. For genes where only two probes were available we selected
the probe with the highest mean intensity across samples; where
three or more probes were available, we calculated the correlation
of probe expression across samples and selected the probe with the
highest average correlation.
"""
elif self.probe_selection == "corr_variance":
report += """
When multiple probes indexed the expression of the same gene we
selected and used the expression profile of a single representative
probe. For genes where only two probes were available we selected
the probe with the highest variance across samples; where three or
more probes were available, we calculated the correlation of probe
expression across samples and selected the probe with the highest
average correlation.
"""
elif self.probe_selection == "rnaseq":
report += """
When multiple probes indexed the expression of the same gene we
selected and used the probe with the most consistent pattern of
regional expression to RNA-seq data (available for two donors in
the AHBA). That is, we calculated the Spearman’s rank correlation
between each probes' microarray expression and RNA-seq expression
data of the corresponding gene, and selected the probe with the
highest correspondence. Here, regions correspond to the structural
designations provided in the ontology from the AHBA.
"""
if (self.donor_probes == "aggregate" and self.probe_selection not in
['average', 'diff_stability', 'rnaseq']):
report += """
The selection of probes was performed using sample expression data
aggregated across all donors.<br>
"""
elif (self.donor_probes == "aggregate" and self.probe_selection in
['average', 'diff_stability', 'rnaseq']):
report += "<br>"
elif self.donor_probes == "independent":
report += """
The selection of probes was performed independently for each donor,
such that the probes chosen to represent each gene could differ
across donors.<br>
"""
elif self.donor_probes == "common":
report += """
The selection of probes was performed independently for each donor,
and the probe most commonly selected across all donors was chosen
to represent the expression of each gene for all donors.<br>
"""
if self.corrected_mni and self.group_atlas:
report += """
The MNI coordinates of tissue samples were updated to those
generated via non-linear registration using the Advanced
Normalization Tools (ANTs; https://github.com/chrisfilo/alleninf).
"""
if self.lr_mirror == 'bidirectional':
report += """
To increase spatial coverage, tissue samples were mirrored
bilaterally across the left and right hemispheres [R2018N].
"""
elif self.lr_mirror == 'leftright':
report += """
To increase spatial coverage, tissue samples in the left hemisphere
were mirrored into the right hemisphere [R2018N].
"""
elif self.lr_mirror == 'rightleft':
report += """
To increase spatial coverage, tissue samples in the right
hemisphere were mirrored into the left hemisphere [R2018N].
"""
if self.tolerance == 0 and not self.atlas.volumetric:
report += """
Samples were assigned to brain regions by minimizing the Euclidean
distance between the {space} coordinates of each sample and the
nearest surface vertex. Samples where the Euclidean distance to the
nearest vertex was greater than the mean distance for all samples
belonging to that donor were excluded.
""".format(space='MNI' if self.group_atlas else 'native voxel')
elif self.tolerance == 0 and self.atlas.volumetric:
report += """
Samples were assigned to brain regions in the provided atlas only
if their {space} coordinates were directly within a voxel belonging
to a parcel.
""".format(space='MNI' if self.group_atlas else 'native voxel')
elif self.tolerance > 0 and not self.atlas.volumetric:
report += """
Samples were assigned to brain regions by minimizing the Euclidean
distance between the {space} coordinates of each sample and the
nearest surface vertex. Samples where the Euclidean distance to the
nearest vertex was more than {tolerance} standard deviations above
the mean distance for all samples belonging to that donor were
excluded.
""".format(space='MNI' if self.group_atlas else 'native voxel',
tolerance=self.tolerance)
elif self.tolerance > 0 and self.atlas.volumetric:
report += """
Samples were assigned to brain regions in the provided atlas if
their {space} coordinates were within {tolerance} mm of a given
parcel.
""".format(space='MNI' if self.group_atlas else 'native voxel',
tolerance=self.tolerance)
elif self.tolerance < 0 and not self.atlas.volumetric:
report += """
Samples were assigned to brain regions by minimizing the Euclidean
distance between the {space} coordinates of each sample and the
nearest surface vertex. Samples where the Euclidean distance to the
nearest vertex was more than {tolerance}mm were excluded.
""".format(space='MNI' if self.group_atlas else 'native voxel',
tolerance=abs(self.tolerance))
if self.atlas_info is not None:
report += """
To reduce the potential for misassignment, sample-to-region
matching was constrained by hemisphere and gross structural
divisions (i.e., cortex, subcortex/brainstem, and cerebellum, such
that e.g., a sample in the left cortex could only be assigned to an
atlas parcel in the left cortex; [A2019N]).
"""
if self.missing == 'centroids':
report += """
If a brain region was not assigned a sample from any donor based on
the above procedure, the tissue sample closest to the centroid of
that parcel was identified independently for each donor. The
average of these samples was taken across donors, weighted by the
distance between the parcel centroid and the sample, to obtain an
estimate of the parcellated expression values for the missing
region.
"""
if self.counts is not None:
n_missing = np.sum(self.counts.sum(axis=1) == 0)
if n_missing > 0:
report += """
This procedure was performed for {n_missing} regions that
were not assigned tissue samples.
""".format(n_missing=n_missing)
elif self.missing == 'interpolate':
report += """
If a brain region was not assigned a tissue sample based on the
above procedure, every {node} in the region was mapped to the
nearest tissue sample from the donor in order to generate a dense,
interpolated expression map. The average of these expression values
was taken across all {nodes} in the region, weighted by the
distance between each {node} and the sample mapped to it, in order
to obtain an estimate of the parcellated expression values for the
missing region.
""".format(node='voxel' if self.atlas.volumetric else 'vertex',
nodes='voxels' if self.atlas.volumetric else 'vertices')
if self.norm_matched:
report += """
All tissue samples not assigned to a brain region in the provided
atlas were discarded.
"""
report += "<br>"
if self.sample_norm is not None:
report += """
Inter-subject variation was addressed {}
""".format(_get_norm_procedure(self.sample_norm, 'sample_norm'))
if self.gene_norm is None:
pass
elif self.gene_norm == self.sample_norm:
report += """
Gene expression values were then normalized across tissue samples
using an identical procedure.
"""
elif (self.gene_norm != self.sample_norm
and self.sample_norm is not None):
report += """
Inter-subject variation in gene expression was then addressed {}
""".format(_get_norm_procedure(self.gene_norm, 'gene_norm'))
elif self.gene_norm != self.sample_norm and self.sample_norm is None:
report += """
Inter-subject variation was addressed {}
""".format(_get_norm_procedure(self.gene_norm, 'gene_norm'))
if not self.norm_matched and not self.norm_structures:
report += """
All available tissue samples were used in the normalization process
regardless of whether they were assigned to a brain region.
"""
elif not self.norm_matched and self.norm_structures:
report += """
All available tissue samples were used in the normalization process
regardless of whether they were matched to a brain region; however,
normalization was performed separately for samples in distinct
structural classes (i.e., cortex, subcortex/brainstem, cerebellum).
"""
elif self.norm_matched and self.norm_structures:
report += """
Normalization was performed separately for samples in distinct
structural classes (i.e., cortex, subcortex/brainstem, cerebellum).
"""
if not self.norm_matched and self.gene_norm is not None:
report += """
Tissue samples not matched to a brain region were discarded after
normalization.
"""
if self.region_agg == 'donors' and self.agg_metric == 'mean':
report += """
Samples assigned to the same brain region were averaged separately
for each donor{donors}, yielding a regional expression matrix
{n_donors}
""".format(donors=' and then across donors' if
not self.return_donors else '',
n_donors=' for each donor ' if
self.return_donors else '')
elif self.region_agg == 'samples' and self.agg_metric == 'mean':
report += """
Samples assigned to the same brain region were averaged across all
donors, yielding a regional expression matrix
"""
elif self.region_agg == 'donors' and self.agg_metric == 'median':
report += """
The median value of samples assigned to the same brain region was
computed separately for each donor{donors}, yielding a regional
expression matrix{n_donors}
""".format(donors=' and then across donors' if
not self.return_donors else '',
n_donors=' for each donor' if
self.return_donors else '')
elif self.region_agg == 'samples' and self.agg_metric == 'median':
report += """
The median value of samples assigned to the same brain region was
computed across donors, yielding a single regional expression
matrix
"""
if self.n_genes is not None:
report += """
with {n_region} rows, corresponding to brain regions, and
{n_genes:,} columns, corresponding to the retained genes
""".format(n_region=len(self.atlas.labels),
n_genes=self.n_genes)
report += "\b."
report += str(_add_references(report))
return _sanitize_text(report)
def _sanitize_text(text):
""" Cleans up line and paragraph breaks in `text`
"""
text = ' '.join(text.replace('\n', ' ').split())
return text.replace('<br> ', '\n\n') \
.replace('<p>', '\n')
def _get_donor_demographics(donors):
"""
Gets demographic info about the requested `donors`
Parameters
----------
donors : list, optional
List of donors. Can be either donor numbers or UID. If not specified
will use all available donors. Note that donors '9861' and '10021'
have samples from both left + right hemispheres; all other donors have
samples from the left hemisphere only. Default: 'all'
Returns
-------
info : dict
With keys ['n_donor', 'n_female','min', 'max', 'mean', 'std']. The
values represent features of the age distribution of requested donors.
"""
donors = [int(i) for i in check_donors(donors)]
info = fetch_donor_info().set_index('uid').loc[donors]
age_info = info['age'].describe()
dinfo = dict(n_donors=len(info), n_female=sum(info['sex'] == 'F'))
dinfo.update(age_info.loc[['min', 'max', 'mean', 'std']].to_dict())
return dinfo
def _get_norm_procedure(norm, parameter):
"""
Returns methods description of the provided `norm`
Parameters
----------
norm : str
Normalization procedure
Returns
-------
procedure : str
Reporting procedures
"""
if parameter not in ('sample_norm', 'gene_norm'):
raise ValueError(f'Invalid norm parameter {parameter}')
if parameter == 'sample_norm':
mod = ('tissue sample expression values across genes')
suff = ('tissue sample across genes')
else:
mod = ('gene expression values across tissue samples')
suff = ('gene across tissue samples')
sigmoid = r"""
independently for each donor using a sigmoid function [F2016P]:<br>
$$ x_{{norm}} = \frac{{1}}{{1 + \exp(-\frac{{(x - \overline{{x}})}}
{{\sigma_{{x}}}})}} $$<br>
where $\bar{{x}}$ is the arithmetic mean and $\sigma_{{x}}$ is the
sample standard deviation of the expression of a single
"""
robust_sigmoid = r"""
using a robust sigmoid function [F2013J]:<br>
$$ x_{{norm}} = \frac{{1}}{{1 + \exp(-\frac{{(x-\langle x \rangle)}}
{{\text{{IQR}}_{{x}}}})}} $$<br>
where $\langle x \rangle$ is the median and $\text{{IQR}}_{{x}}$
is the normalized interquartile range of the expression of a single
"""
rescale = r"""
Normalized expression values were then rescaled to the unit interval: <br>
$$ x_{{scaled}} = \frac{{x_{{norm}} - \min(x_{{norm}})}}
{{\max(x_{{norm}}) - \min(x_{{norm}})}} $$<br>
"""
procedure = ''
if norm in ['center', 'demean']:
procedure += """
by demeaning {mod} independently for each donor.
""".format(mod=mod)
elif norm == 'zscore':
procedure += """
by mean- and variance-normalizing (i.e., z-scoring) {mod} independently
for each donor.
""".format(mod=mod)
elif norm == 'minmax':
procedure += r"""
by rescaling {mod} to the unit interval independently for each donor:
<br>
$$ x_{{{{scaled}}}} = \frac{{{{x - \min(x)}}}}{{{{\max(x) - \min(x)}}}}
$$<br>
""".format(mod=mod)
elif norm in ['sigmoid', 'sig']:
procedure += r"""
by normalizing {mod} {sigmoid} {suff}.
""".format(mod=mod, sigmoid=sigmoid, suff=suff)
elif norm in ['scaled_sigmoid', 'scaled_sig']:
procedure += r"""
by normalizing {mod} {sigmoid} {suff}. {rescale}
""".format(mod=mod, sigmoid=sigmoid, suff=suff, rescale=rescale)
elif norm in ['scaled_sigmoid_quantiles', 'scaled_sig_qnt']:
procedure += r"""
by normalizing {mod} {sigmoid} {suff}, calculated using only data in
the 5–95th percentile range to downweight the impact of outliers.
{rescale}
""".format(mod=mod, sigmoid=sigmoid, suff=suff, rescale=rescale)
elif norm in ['robust_sigmoid', 'rsig', 'rs']:
procedure += r"""
by normalizing {mod} {robust_sigmoid} {suff}.
""".format(mod=mod, robust_sigmoid=robust_sigmoid, suff=suff)
elif norm in ['scaled_robust_sigmoid', 'scaled_rsig', 'srs']:
procedure += r"""
by normalizing {mod} {robust_sigmoid} {suff}. {rescale}
""".format(mod=mod, robust_sigmoid=robust_sigmoid, suff=suff,
rescale=rescale)
elif norm in ['mixed_sigmoid', 'mixed_sig']:
procedure += r"""
by normalizing {mod} using a mixed sigmoid function [F2013J]:<br>
$$ x_{{{{norm}}}} = \left\{{{{\begin{{{{array}}}}{{{{r r}}}}
\frac{{{{1}}}}{{{{1 + \exp(-\frac{{{{(x-\overline{{{{x}}}})}}}}
{{{{\sigma_{{{{x}}}}}}}})}}}} ,& \text{{{{IQR}}}}_{{{{x}}}} = 0
\frac{{{{1}}}}{{{{1 + \exp(-\frac{{{{(x-\langle x \rangle)}}}}
{{{{\text{{{{IQR}}}}_{{{{x}}}}}}}})}}}} ,& \text{{{{IQR}}}}_{{{{x}}}}
\neq 0 \end{{{{array}}}}\right. $$<br>
where $\bar{{{{x}}}}$ is the arithmetic mean, $\sigma_{{{{x}}}}$ is the
sample standard deviation, $\langle x \rangle$ is the median, and
$\text{{{{IQR}}}}_{{{{x}}}}$ is the normalized interquartile range of
the expression value of a single {suff}. {rescale}
""".format(mod=mod, suff=suff, rescale=rescale)
return procedure
def _add_references(report):
"""
Detects references in `report` and generates list
Parameters
----------
report : str
Report body
Returns
-------
references : str
List of references to be appended to `report`
"""
refreport = ''
for ref, cite in REFERENCES.items():
if ref in report:
refreport += f'[{ref}]: {cite}<p>'
if len(refreport) > 0:
refreport = '<br> REFERENCES<p>----------<p>' + refreport
if refreport.endswith('<p> '):
refreport = refreport[:-4]
return refreport