Source code for abagen.reporting

# -*- coding: utf-8 -*-
"""
Functions for generating workflow methods reports

Note: all text contained within this module is released under a `CC-0 license
<https://creativecommons.org/publicdomain/zero/1.0/>`_.
"""

import logging

import numpy as np

from . import __version__
from .datasets import check_donors, fetch_donor_info
from .images import coerce_atlas_to_dict
from .utils import first_entry

LGR = logging.getLogger('abagen')
METRICS = {
    np.mean: 'mean',
    np.median: 'median'
}
REFERENCES = dict(
    A2019N=('Arnatkevic̆iūtė, A., Fulcher, B. D., & Fornito, A. (2019). '
            'A practical guide to linking brain-wide gene expression and '
            'neuroimaging data. Neuroimage, 189, 353-367.'),
    F2016P=('Fulcher, B. D., & Fornito, A. (2016). A transcriptional '
            'signature of hub connectivity in the mouse connectome. '
            'Proceedings of the National Academy of Sciences, 113(5), '
            '1435-1440.'),
    F2013J=('Fulcher, B. D., Little, M. A., & Jones, N. S. (2013). '
            'Highly comparative time-series analysis: the empirical '
            'structure of time series and their methods. Journal of '
            'the Royal Society Interface, 10(83), 20130048.'),
    H2012N=('Hawrylycz, M. J., Lein, E. S., Guillozet-Bongaarts, A. L., '
            'Shen, E. H., Ng, L., Miller, J. A., ... & Jones, A. R. '
            '(2012). An anatomically comprehensive atlas of the adult '
            'human brain transcriptome. Nature, 489(7416), 391-399.'),
    H2015N=('Hawrylycz, M., Miller, J. A., Menon, V., Feng, D., '
            'Dolbeare, T., Guillozet-Bongaarts, A. L., ... & Lein, E. '
            '(2015). Canonical genetic signatures of the adult human '
            'brain. Nature Neuroscience, 18(12), 1832.'),
    M2021B=('Markello, R.D., Arnatkevic̆iūtė, A., Poline, J-B., Fulcher, B. '
            'D., Fornito, A., & Misic, B. (2021). Standardizing workflows in '
            'imaging transcriptomics with the abagen toolbox. Biorxiv.'),
    P2017G=('Parkes, L., Fulcher, B. D., Yücel, M., & Fornito, A. '
            '(2017). Transcriptional signatures of connectomic '
            'subregions of the human striatum. Genes, Brain and '
            'Behavior, 16(7), 647-663.'),
    Q2002N=('Quackenbush, J. (2002). Microarray data normalization and '
            'transformation. Nature Genetics, 32(4), 496-501.'),
    R2018N=('Romero-Garcia, R., Whitaker, K. J., Váša, F., Seidlitz, '
            'J., Shinn, M., Fonagy, P., ... & NSPN Consortium. (2018). '
            'Structural covariance networks are coupled to expression '
            'of genes enriched in supragranular layers of the human '
            'cortex. NeuroImage, 171, 256-267.')
)


[docs]class Report: """ Generates report of methods for :func:`abagen.get_expression_data()` Refer to the doc-string of the workflow for an overview of paramter options """ def __init__(self, atlas, atlas_info=None, *, ibf_threshold=0.5, probe_selection='diff_stability', donor_probes='aggregate', lr_mirror=None, missing=None, tolerance=2, sample_norm='srs', gene_norm='srs', norm_matched=True, norm_structures=False, region_agg='donors', agg_metric='mean', corrected_mni=True, reannotated=True, donors='all', return_donors=False, data_dir=None, counts=None, n_probes=None, n_genes=None): efflevel = LGR.getEffectiveLevel() LGR.setLevel(100) atlas, self.group_atlas = \ coerce_atlas_to_dict(atlas, donors, atlas_info=atlas_info, data_dir=data_dir) self.atlas = first_entry(atlas) self.atlas_info = self.atlas.atlas_info self.ibf_threshold = ibf_threshold self.probe_selection = probe_selection self.donor_probes = donor_probes self.lr_mirror = lr_mirror self.missing = missing self.tolerance = tolerance self.sample_norm = sample_norm self.gene_norm = gene_norm self.norm_matched = norm_matched self.norm_structures = norm_structures self.region_agg = region_agg self.agg_metric = METRICS.get(agg_metric, agg_metric) self.corrected_mni = corrected_mni self.reannotated = reannotated self.donors = donors self.return_donors = return_donors self.counts = counts self.n_probes = n_probes self.n_genes = n_genes self.body = self.gen_report() LGR.setLevel(efflevel)
[docs] def gen_report(self): """ Generates main text of report """ report = '' report += """ Regional microarry expression data were obtained from {n_donors} post-mortem brains ({n_female} female, ages {min}--{max}, {mean:.2f} +/- {std:.2f}) provided by the Allen Human Brain Atlas (AHBA, https://human.brain-map.org; [H2012N]). Data were processed with the abagen toolbox (version {vers}; https://github.com/rmarkello/abagen; [M2021B]) """.format(**_get_donor_demographics(self.donors), vers=__version__) if self.atlas.volumetric and self.group_atlas: report += """ using a {n_region}-region volumetric atlas in MNI space.<br> """.format(n_region=len(self.atlas.labels)) elif not self.atlas.volumetric and self.group_atlas: report += """ using a {n_region}-region surface-based atlas in MNI space.<br> """.format(n_region=len(self.atlas.labels)) elif self.atlas.volumetric and not self.group_atlas: report += """ using a {n_region}-region volumetric atlas, independently aligned to each donor's native MRI space.<br> """.format(n_region=len(self.atlas.labels)) else: report += ".<br>" if self.reannotated: report += """ First, microarray probes were reannotated using data provided by [A2019N]; probes not matched to a valid Entrez ID were discarded. """ else: report += """ First, microarray probes not matched to a valid Entrez ID were discarded. """ if self.ibf_threshold > 0: report += """ Next, probes were filtered based on their expression intensity relative to background noise [Q2002N], such that probes with intensity less than the background in >={threshold:.2f}% of samples across donors were discarded """.format(threshold=self.ibf_threshold * 100) if self.n_probes is not None: report += """ , yielding {n_probes:,} probes """.format(n_probes=self.n_probes) report += "." if self.probe_selection == 'average': report += """ When multiple probes indexed the expression of the same gene we calculated the mean expression across probes. """ elif self.probe_selection == 'diff_stability': report += r""" When multiple probes indexed the expression of the same gene, we selected and used the probe with the most consistent pattern of regional variation across donors (i.e., differential stability; [H2015N]), calculated with:<br> $$ \Delta_{{S}}(p) = \frac{{1}}{{\binom{{N}}{{2}}}} \, \sum_{{i=1}}^{{N-1}} \sum_{{j=i+1}}^{{N}} \rho[B_{{i}}(p), B_{{j}}(p)] $$<br> where $ \rho $ is Spearman's rank correlation of the expression of a single probe, p, across regions in two donors $B_{{i}}$ and $B_{{j}}$, and N is the total number of donors. Here, regions correspond to the structural designations provided in the ontology from the AHBA. """ elif self.probe_selection == "pc_loading": report += """ When multiple probes indexed the expression of the same gene we selected and used the probe with the highest loading on the first principal component taken from a decomposition of probe expression across samples from all donors [P2017G]. """ elif self.probe_selection == "max_intensity": report += """ When multiple probes indexed the expression of the same gene we selected and used the probe with the highest mean intensity across samples. """ elif self.probe_selection == "max_variance": report += """ When multiple probes indexed the expression of the same gene we selected and used the probe with the highest variance across samples. """ elif self.probe_selection == "corr_intensity": report += """ When multiple probes indexed the expression of the same gene we selected and used the expression profile of a single representative probe. For genes where only two probes were available we selected the probe with the highest mean intensity across samples; where three or more probes were available, we calculated the correlation of probe expression across samples and selected the probe with the highest average correlation. """ elif self.probe_selection == "corr_variance": report += """ When multiple probes indexed the expression of the same gene we selected and used the expression profile of a single representative probe. For genes where only two probes were available we selected the probe with the highest variance across samples; where three or more probes were available, we calculated the correlation of probe expression across samples and selected the probe with the highest average correlation. """ elif self.probe_selection == "rnaseq": report += """ When multiple probes indexed the expression of the same gene we selected and used the probe with the most consistent pattern of regional expression to RNA-seq data (available for two donors in the AHBA). That is, we calculated the Spearman’s rank correlation between each probes' microarray expression and RNA-seq expression data of the corresponding gene, and selected the probe with the highest correspondence. Here, regions correspond to the structural designations provided in the ontology from the AHBA. """ if (self.donor_probes == "aggregate" and self.probe_selection not in ['average', 'diff_stability', 'rnaseq']): report += """ The selection of probes was performed using sample expression data aggregated across all donors.<br> """ elif (self.donor_probes == "aggregate" and self.probe_selection in ['average', 'diff_stability', 'rnaseq']): report += "<br>" elif self.donor_probes == "independent": report += """ The selection of probes was performed independently for each donor, such that the probes chosen to represent each gene could differ across donors.<br> """ elif self.donor_probes == "common": report += """ The selection of probes was performed independently for each donor, and the probe most commonly selected across all donors was chosen to represent the expression of each gene for all donors.<br> """ if self.corrected_mni and self.group_atlas: report += """ The MNI coordinates of tissue samples were updated to those generated via non-linear registration using the Advanced Normalization Tools (ANTs; https://github.com/chrisfilo/alleninf). """ if self.lr_mirror == 'bidirectional': report += """ To increase spatial coverage, tissue samples were mirrored bilaterally across the left and right hemispheres [R2018N]. """ elif self.lr_mirror == 'leftright': report += """ To increase spatial coverage, tissue samples in the left hemisphere were mirrored into the right hemisphere [R2018N]. """ elif self.lr_mirror == 'rightleft': report += """ To increase spatial coverage, tissue samples in the right hemisphere were mirrored into the left hemisphere [R2018N]. """ if self.tolerance == 0 and not self.atlas.volumetric: report += """ Samples were assigned to brain regions by minimizing the Euclidean distance between the {space} coordinates of each sample and the nearest surface vertex. Samples where the Euclidean distance to the nearest vertex was greater than the mean distance for all samples belonging to that donor were excluded. """.format(space='MNI' if self.group_atlas else 'native voxel') elif self.tolerance == 0 and self.atlas.volumetric: report += """ Samples were assigned to brain regions in the provided atlas only if their {space} coordinates were directly within a voxel belonging to a parcel. """.format(space='MNI' if self.group_atlas else 'native voxel') elif self.tolerance > 0 and not self.atlas.volumetric: report += """ Samples were assigned to brain regions by minimizing the Euclidean distance between the {space} coordinates of each sample and the nearest surface vertex. Samples where the Euclidean distance to the nearest vertex was more than {tolerance} standard deviations above the mean distance for all samples belonging to that donor were excluded. """.format(space='MNI' if self.group_atlas else 'native voxel', tolerance=self.tolerance) elif self.tolerance > 0 and self.atlas.volumetric: report += """ Samples were assigned to brain regions in the provided atlas if their {space} coordinates were within {tolerance} mm of a given parcel. """.format(space='MNI' if self.group_atlas else 'native voxel', tolerance=self.tolerance) elif self.tolerance < 0 and not self.atlas.volumetric: report += """ Samples were assigned to brain regions by minimizing the Euclidean distance between the {space} coordinates of each sample and the nearest surface vertex. Samples where the Euclidean distance to the nearest vertex was more than {tolerance}mm were excluded. """.format(space='MNI' if self.group_atlas else 'native voxel', tolerance=abs(self.tolerance)) if self.atlas_info is not None: report += """ To reduce the potential for misassignment, sample-to-region matching was constrained by hemisphere and gross structural divisions (i.e., cortex, subcortex/brainstem, and cerebellum, such that e.g., a sample in the left cortex could only be assigned to an atlas parcel in the left cortex; [A2019N]). """ if self.missing == 'centroids': report += """ If a brain region was not assigned a sample from any donor based on the above procedure, the tissue sample closest to the centroid of that parcel was identified independently for each donor. The average of these samples was taken across donors, weighted by the distance between the parcel centroid and the sample, to obtain an estimate of the parcellated expression values for the missing region. """ if self.counts is not None: n_missing = np.sum(self.counts.sum(axis=1) == 0) if n_missing > 0: report += """ This procedure was performed for {n_missing} regions that were not assigned tissue samples. """.format(n_missing=n_missing) elif self.missing == 'interpolate': report += """ If a brain region was not assigned a tissue sample based on the above procedure, every {node} in the region was mapped to the nearest tissue sample from the donor in order to generate a dense, interpolated expression map. The average of these expression values was taken across all {nodes} in the region, weighted by the distance between each {node} and the sample mapped to it, in order to obtain an estimate of the parcellated expression values for the missing region. """.format(node='voxel' if self.atlas.volumetric else 'vertex', nodes='voxels' if self.atlas.volumetric else 'vertices') if self.norm_matched: report += """ All tissue samples not assigned to a brain region in the provided atlas were discarded. """ report += "<br>" if self.sample_norm is not None: report += """ Inter-subject variation was addressed {} """.format(_get_norm_procedure(self.sample_norm, 'sample_norm')) if self.gene_norm is None: pass elif self.gene_norm == self.sample_norm: report += """ Gene expression values were then normalized across tissue samples using an identical procedure. """ elif (self.gene_norm != self.sample_norm and self.sample_norm is not None): report += """ Inter-subject variation in gene expression was then addressed {} """.format(_get_norm_procedure(self.gene_norm, 'gene_norm')) elif self.gene_norm != self.sample_norm and self.sample_norm is None: report += """ Inter-subject variation was addressed {} """.format(_get_norm_procedure(self.gene_norm, 'gene_norm')) if not self.norm_matched and not self.norm_structures: report += """ All available tissue samples were used in the normalization process regardless of whether they were assigned to a brain region. """ elif not self.norm_matched and self.norm_structures: report += """ All available tissue samples were used in the normalization process regardless of whether they were matched to a brain region; however, normalization was performed separately for samples in distinct structural classes (i.e., cortex, subcortex/brainstem, cerebellum). """ elif self.norm_matched and self.norm_structures: report += """ Normalization was performed separately for samples in distinct structural classes (i.e., cortex, subcortex/brainstem, cerebellum). """ if not self.norm_matched and self.gene_norm is not None: report += """ Tissue samples not matched to a brain region were discarded after normalization. """ if self.region_agg == 'donors' and self.agg_metric == 'mean': report += """ Samples assigned to the same brain region were averaged separately for each donor{donors}, yielding a regional expression matrix {n_donors} """.format(donors=' and then across donors' if not self.return_donors else '', n_donors=' for each donor ' if self.return_donors else '') elif self.region_agg == 'samples' and self.agg_metric == 'mean': report += """ Samples assigned to the same brain region were averaged across all donors, yielding a regional expression matrix """ elif self.region_agg == 'donors' and self.agg_metric == 'median': report += """ The median value of samples assigned to the same brain region was computed separately for each donor{donors}, yielding a regional expression matrix{n_donors} """.format(donors=' and then across donors' if not self.return_donors else '', n_donors=' for each donor' if self.return_donors else '') elif self.region_agg == 'samples' and self.agg_metric == 'median': report += """ The median value of samples assigned to the same brain region was computed across donors, yielding a single regional expression matrix """ if self.n_genes is not None: report += """ with {n_region} rows, corresponding to brain regions, and {n_genes:,} columns, corresponding to the retained genes """.format(n_region=len(self.atlas.labels), n_genes=self.n_genes) report += "\b." report += str(_add_references(report)) return _sanitize_text(report)
def _sanitize_text(text): """ Cleans up line and paragraph breaks in `text` """ text = ' '.join(text.replace('\n', ' ').split()) return text.replace('<br> ', '\n\n') \ .replace('<p>', '\n') def _get_donor_demographics(donors): """ Gets demographic info about the requested `donors` Parameters ---------- donors : list, optional List of donors. Can be either donor numbers or UID. If not specified will use all available donors. Note that donors '9861' and '10021' have samples from both left + right hemispheres; all other donors have samples from the left hemisphere only. Default: 'all' Returns ------- info : dict With keys ['n_donor', 'n_female','min', 'max', 'mean', 'std']. The values represent features of the age distribution of requested donors. """ donors = [int(i) for i in check_donors(donors)] info = fetch_donor_info().set_index('uid').loc[donors] age_info = info['age'].describe() dinfo = dict(n_donors=len(info), n_female=sum(info['sex'] == 'F')) dinfo.update(age_info.loc[['min', 'max', 'mean', 'std']].to_dict()) return dinfo def _get_norm_procedure(norm, parameter): """ Returns methods description of the provided `norm` Parameters ---------- norm : str Normalization procedure Returns ------- procedure : str Reporting procedures """ if parameter not in ('sample_norm', 'gene_norm'): raise ValueError(f'Invalid norm parameter {parameter}') if parameter == 'sample_norm': mod = ('tissue sample expression values across genes') suff = ('tissue sample across genes') else: mod = ('gene expression values across tissue samples') suff = ('gene across tissue samples') sigmoid = r""" independently for each donor using a sigmoid function [F2016P]:<br> $$ x_{{norm}} = \frac{{1}}{{1 + \exp(-\frac{{(x - \overline{{x}})}} {{\sigma_{{x}}}})}} $$<br> where $\bar{{x}}$ is the arithmetic mean and $\sigma_{{x}}$ is the sample standard deviation of the expression of a single """ robust_sigmoid = r""" using a robust sigmoid function [F2013J]:<br> $$ x_{{norm}} = \frac{{1}}{{1 + \exp(-\frac{{(x-\langle x \rangle)}} {{\text{{IQR}}_{{x}}}})}} $$<br> where $\langle x \rangle$ is the median and $\text{{IQR}}_{{x}}$ is the normalized interquartile range of the expression of a single """ rescale = r""" Normalized expression values were then rescaled to the unit interval: <br> $$ x_{{scaled}} = \frac{{x_{{norm}} - \min(x_{{norm}})}} {{\max(x_{{norm}}) - \min(x_{{norm}})}} $$<br> """ procedure = '' if norm in ['center', 'demean']: procedure += """ by demeaning {mod} independently for each donor. """.format(mod=mod) elif norm == 'zscore': procedure += """ by mean- and variance-normalizing (i.e., z-scoring) {mod} independently for each donor. """.format(mod=mod) elif norm == 'minmax': procedure += r""" by rescaling {mod} to the unit interval independently for each donor: <br> $$ x_{{{{scaled}}}} = \frac{{{{x - \min(x)}}}}{{{{\max(x) - \min(x)}}}} $$<br> """.format(mod=mod) elif norm in ['sigmoid', 'sig']: procedure += r""" by normalizing {mod} {sigmoid} {suff}. """.format(mod=mod, sigmoid=sigmoid, suff=suff) elif norm in ['scaled_sigmoid', 'scaled_sig']: procedure += r""" by normalizing {mod} {sigmoid} {suff}. {rescale} """.format(mod=mod, sigmoid=sigmoid, suff=suff, rescale=rescale) elif norm in ['scaled_sigmoid_quantiles', 'scaled_sig_qnt']: procedure += r""" by normalizing {mod} {sigmoid} {suff}, calculated using only data in the 5–95th percentile range to downweight the impact of outliers. {rescale} """.format(mod=mod, sigmoid=sigmoid, suff=suff, rescale=rescale) elif norm in ['robust_sigmoid', 'rsig', 'rs']: procedure += r""" by normalizing {mod} {robust_sigmoid} {suff}. """.format(mod=mod, robust_sigmoid=robust_sigmoid, suff=suff) elif norm in ['scaled_robust_sigmoid', 'scaled_rsig', 'srs']: procedure += r""" by normalizing {mod} {robust_sigmoid} {suff}. {rescale} """.format(mod=mod, robust_sigmoid=robust_sigmoid, suff=suff, rescale=rescale) elif norm in ['mixed_sigmoid', 'mixed_sig']: procedure += r""" by normalizing {mod} using a mixed sigmoid function [F2013J]:<br> $$ x_{{{{norm}}}} = \left\{{{{\begin{{{{array}}}}{{{{r r}}}} \frac{{{{1}}}}{{{{1 + \exp(-\frac{{{{(x-\overline{{{{x}}}})}}}} {{{{\sigma_{{{{x}}}}}}}})}}}} ,& \text{{{{IQR}}}}_{{{{x}}}} = 0 \frac{{{{1}}}}{{{{1 + \exp(-\frac{{{{(x-\langle x \rangle)}}}} {{{{\text{{{{IQR}}}}_{{{{x}}}}}}}})}}}} ,& \text{{{{IQR}}}}_{{{{x}}}} \neq 0 \end{{{{array}}}}\right. $$<br> where $\bar{{{{x}}}}$ is the arithmetic mean, $\sigma_{{{{x}}}}$ is the sample standard deviation, $\langle x \rangle$ is the median, and $\text{{{{IQR}}}}_{{{{x}}}}$ is the normalized interquartile range of the expression value of a single {suff}. {rescale} """.format(mod=mod, suff=suff, rescale=rescale) return procedure def _add_references(report): """ Detects references in `report` and generates list Parameters ---------- report : str Report body Returns ------- references : str List of references to be appended to `report` """ refreport = '' for ref, cite in REFERENCES.items(): if ref in report: refreport += f'[{ref}]: {cite}<p>' if len(refreport) > 0: refreport = '<br> REFERENCES<p>----------<p>' + refreport if refreport.endswith('<p> '): refreport = refreport[:-4] return refreport