# Source code for abagen.datasets.fetchers

# -*- coding: utf-8 -*-
"""
Functions for downloading the Allen Brain Atlas human microarray dataset.
"""

from collections import namedtuple
from functools import partial
import multiprocessing as mp
import os
from pkg_resources import resource_filename

import nibabel as nib
import pandas as pd

from .. import io
from ..utils import load_gifti, first_entry
from .utils import _get_dataset_dir, _fetch_files

# Lookup table relating the different identifiers used for each donor. Every
# row describes one donor and `fields` names the columns:
#   subj - donor number (e.g., '9861')
#   uid  - donor UID (e.g., 'H0351.2001')
#   url  - well-known file ID inserted into the microarray download URL
#   t1w  - well-known file ID inserted into the T1w MRI download URL
#   t2w  - well-known file ID inserted into the T2w MRI download URL
WELL_KNOWN_IDS = nib.volumeutils.Recoder(
    (('9861', 'H0351.2001', '178238387', '157722636', '157722638'),
     ('10021', 'H0351.2002', '178238373', '157723301', '157723303'),
     ('12876', 'H0351.1009', '178238359', '157722290', '157722292'),
     ('15496', 'H0351.1015', '178238266', '162021642', '162021644'),
     ('14380', 'H0351.1012', '178238316', '157721937', '157721939'),
     ('15697', 'H0351.1016', '178236545', '157682966', '157682968')),
    fields=('subj', 'uid', 'url', 't1w', 't2w')
)
# sorted list of every identifier (donor number or UID) accepted as a donor
VALID_DONORS = sorted(WELL_KNOWN_IDS.value_set('subj')
                      | WELL_KNOWN_IDS.value_set('uid'))
# `resource_filename` pre-bound to the 'abagen' package, so callers only need
# to supply the path of the resource relative to the package
RESOURCE = partial(resource_filename, 'abagen')


def check_donors(donors, default='12876', valid=VALID_DONORS):
    """
    Checks that provided `donors` are valid

    Parameters
    ----------
    donors : str or list of str
        Donor(s) to check; can be either donor number or UID. Can also
        specify 'all' to select every entry in `valid`. If None is provided
        then `default` will be used.
    default : str, optional
        Default donor to use if `donors` is None. Default: '12876'
    valid : list of str, optional
        List of valid donor numbers and UIDs. Default: :obj:`VALID_DONORS`

    Returns
    -------
    donors : list of str
        Unique donor subject IDs, sorted by their integer value

    Raises
    ------
    ValueError
        If any entry of `donors` is not in `valid`
    """

    if donors is None:
        donors = [default]
    elif isinstance(donors, str):
        # check for `str` *before* comparing against 'all' so that array-like
        # inputs are never compared element-wise against a string
        donors = valid if donors == 'all' else [donors]

    subjects = set()
    for sub_id in donors:
        if sub_id not in valid:
            raise ValueError('Invalid subject id: {0}. Subjects must be in: '
                             '{1}.'.format(sub_id, valid))
        # indexing the recoder converts any known alias to the ID system
        subjects.add(WELL_KNOWN_IDS[sub_id])

    return sorted(subjects, key=int)


def fetch_microarray(data_dir=None, donors=None, resume=True, verbose=1,
                     convert=True, n_proc=1):
    """
    Downloads the Allen Human Brain Atlas microarray expression dataset

    Parameters
    ----------
    data_dir : str, optional
        Directory where data should be downloaded and unpacked. Default: $HOME/
        abagen-data
    donors : list, optional
        List of donors to download; can be either donor number or UID. Can also
        specify 'all' to download all available donors. Default: 12876
    resume : bool, optional
        Whether to resume download of a partly-downloaded file. Default: True
    verbose : int, optional
        Verbosity level (0 means no message). Default: 1
    convert : bool, optional
        Whether to convert downloaded CSV files into parquet format for faster
        loading in the future; only available if ``fastparquet`` and ``python-
        snappy`` are installed. Default: True
    n_proc : int, optional
        Number of processes to parallelize download if multiple donors are
        specified. Negative values are counted back from the number of CPUs
        (i.e., -1 uses all of them). Default: 1

    Returns
    -------
    data : dict
        Two-level nested dictionary, where top-level keys are donor IDs and
        second-level keys are ['microarray', 'ontology', 'pacall', 'probes',
        'annotation'], where corresponding values are filepaths to the
        downloaded CSV files.

    References
    ----------
    Hawrylycz, M. J., Lein, E. S., Guillozet-Bongaarts, A. L., Shen, E. H., Ng,
    L., Miller, J. A., ... & Abajian, C. (2012). An anatomically comprehensive
    atlas of the adult human brain transcriptome. Nature, 489(7416), 391.
    """

    url = "https://human.brain-map.org/api/v2/well_known_file_download/{}"

    data_dir = _get_dataset_dir('microarray', data_dir=data_dir,
                                verbose=verbose)

    # `keys` and `sub_files` are parallel: keys[i] labels sub_files[i]
    keys = ('microarray', 'ontology', 'pacall', 'probes', 'annotation')
    sub_files = ('MicroarrayExpression.csv',
                 'Ontology.csv', 'PACall.csv',
                 'Probes.csv', 'SampleAnnot.csv')
    n_files = len(sub_files)
    donors = check_donors(donors)

    if n_proc < 0:
        n_proc = mp.cpu_count() + n_proc + 1

    # one list of (relative path, url, options) entries per donor; all files
    # of a donor come out of the same downloaded archive
    per_donor = []
    for sub in donors:
        folder = 'normalized_microarray_donor{}'.format(sub)
        archive = os.path.join(folder, 'donor{}.zip'.format(sub))
        source = url.format(WELL_KNOWN_IDS.url[sub])
        per_donor.append([
            (os.path.join(folder, fname), source,
             dict(uncompress=True, move=archive))
            for fname in sub_files
        ])

    if n_proc > 1 and len(per_donor) > 1:
        # one task per donor; extra workers beyond that would only sit idle
        with mp.Pool(min(n_proc, len(per_donor))) as pool:
            results = [pool.apply_async(_fetch_files,
                                        (data_dir, entries),
                                        dict(resume=resume, verbose=verbose))
                       for entries in per_donor]
            # flatten outputs into single list (donor order is preserved)
            files = [fn for res in results for fn in res.get()]
    else:
        # flatten list of lists into single list
        flat = [entry for entries in per_donor for entry in entries]
        files = _fetch_files(data_dir, flat, resume=resume, verbose=verbose)

    # if we want to convert files to parquet format it's good to do that now
    # this step is _already_ super long, so an extra 1-2 minutes is negligible
    if convert and io.use_parq:
        # offsets 0 and 2 are the expression and PACall files of each donor
        for fn in files[0::n_files] + files[2::n_files]:
            io._make_parquet(fn, convert_only=True)

    return {
        donor: dict(zip(keys, files[k:k + n_files]))
        for k, donor in zip(range(0, len(files), n_files), donors)
    }


def fetch_rnaseq(data_dir=None, donors=None, resume=True, verbose=1):
    """
    Downloads RNA-sequencing data from the Allen Human Brain Atlas

    Parameters
    ----------
    data_dir : str, optional
        Directory where data should be downloaded and unpacked. If not
        specified the location is resolved by ``_get_dataset_dir``.
    donors : list, optional
        List of donors to download; can be either donor number or UID. Can also
        specify 'all' to download all available donors (two). Valid donors
        for which no RNA-sequencing data are listed here are silently dropped.
        Default: 9861
    resume : bool, optional
        Whether to resume download of a partly-downloaded file. Default: True
    verbose : int, optional
        Verbosity level (0 means no message). Default: 1

    Returns
    -------
    data : dict
        Two-level nested dictionary, where top-level keys are donor IDs and
        second-level keys are ['genes', 'ontology', 'counts', 'tpm',
        'annotation'], where corresponding values are filepaths to the
        downloaded CSV files.

    References
    ----------
    Hawrylycz, M. J., Lein, E. S., Guillozet-Bongaarts, A. L., Shen, E. H., Ng,
    L., Miller, J. A., ... & Abajian, C. (2012). An anatomically comprehensive
    atlas of the adult human brain transcriptome. Nature, 489(7416), 391.
    """

    url = "https://human.brain-map.org/api/v2/well_known_file_download/{}"
    # donor number -> well-known file ID of the RNA-seq archive
    well_known_ids = {
        '9861': '278447594',
        '10021': '278448166'
    }

    data_dir = _get_dataset_dir('rnaseq', data_dir=data_dir,
                                verbose=verbose)

    # `keys` and `sub_files` are parallel: keys[i] labels sub_files[i]
    keys = ('genes', 'ontology', 'counts', 'tpm', 'annotation')
    sub_files = ('Genes.csv', 'Ontology.csv',
                 'RNAseqCounts.csv', 'RNAseqTPM.csv', 'SampleAnnot.csv')
    n_files = len(sub_files)

    # `check_donors` returns donor numbers, so restricting to the donors with
    # a known RNA-seq archive only requires intersecting with the table keys
    requested = set(check_donors(donors, default='9861'))
    donors = sorted(requested & set(well_known_ids), key=int)

    files = []
    for sub in donors:
        folder = 'rnaseq_donor{}'.format(sub)
        archive = os.path.join(folder, 'donor{}.zip'.format(sub))
        source = url.format(well_known_ids[sub])
        files.extend(
            (os.path.join(folder, fname), source,
             dict(uncompress=True, move=archive))
            for fname in sub_files
        )

    files = _fetch_files(data_dir, files, resume=resume, verbose=verbose)

    return {
        donor: dict(zip(keys, files[k:k + n_files]))
        for k, donor in zip(range(0, len(files), n_files), donors)
    }


def fetch_raw_mri(data_dir=None, donors=None, resume=True, verbose=1):
    """
    Downloads the "raw" Allen Human Brain Atlas T1w/T2w MRI images

    Parameters
    ----------
    data_dir : str, optional
        Directory where data should be downloaded and unpacked. Default: $HOME/
        abagen-data
    donors : list, optional
        List of donors to download; can be either donor number or UID. Can also
        specify 'all' to download all available donors. Default: 12876
    resume : bool, optional
        Whether to resume download of a partly-downloaded file. Default: True
    verbose : int, optional
        Verbosity level (0 means no message). Default: 1

    Returns
    -------
    mris : dict
        Two-level nested dictionary, where top-level keys are donor IDs and
        second-level keys are ['t1w', 't2w'], where corresponding values are
        filepaths to the downloaded Nifti files
    """

    url = "https://human.brain-map.org/api/v2/well_known_file_download/{}"

    data_dir = _get_dataset_dir('mri', data_dir=data_dir,
                                verbose=verbose)

    # image type (also the name of the ID column to look up) -> output filename
    sub_files = dict(t1w='T1.nii.gz', t2w='T2.nii.gz')
    n_files = len(sub_files)
    donors = check_donors(donors)

    files = []
    for sub in donors:
        folder = 'mri_donor{}'.format(sub)
        for img, fname in sub_files.items():
            target = os.path.join(folder, fname)
            # each image is its own download, moved straight to its target
            files.append((target,
                          url.format(getattr(WELL_KNOWN_IDS, img)[sub]),
                          dict(move=target)))

    files = _fetch_files(data_dir, files, resume=resume, verbose=verbose)

    return {
        donor: dict(zip(sub_files, files[k:k + n_files]))
        for k, donor in zip(range(0, len(files), n_files), donors)
    }


def fetch_freesurfer(data_dir=None, donors=None, resume=True, verbose=1):
    """
    Downloads FreeSurfer reconstructions of the Allen Human Brain Atlas MRIs

    Parameters
    ----------
    data_dir : str, optional
        Directory where data should be downloaded and unpacked. Default: $HOME/
        abagen-data
    donors : list, optional
        List of donors to download; can be either donor number or UID. Can also
        specify 'all' to download all available donors. Default: 12876
    resume : bool, optional
        Whether to resume download of a partly-downloaded file. Default: True
    verbose : int, optional
        Verbosity level (0 means no message). Default: 1

    Returns
    -------
    freesurfer : dict
        Dictionary where keys are donor IDs and values are paths to FreeSurfer
        directories for requested `donors`

    References
    ----------
    Romero-Garcia, R., Whitaker, K., Vasa, F., Seidlitz, J., Shinn, M., Fonagy,
    P., Jones, P., et al. (2017). Data supporting NSPN publication "Structural
    covariance networks are coupled to expression of genes enriched in
    supragranular layers of the human cortex " [Dataset].
    https://doi.org/10.17863/CAM.11392
    """

    url = "https://www.repository.cam.ac.uk/bitstream/handle/1810/265272/" \
          "donor{}.zip"
    data_dir = _get_dataset_dir('freesurfer', data_dir=data_dir,
                                verbose=verbose)

    donors = check_donors(donors)
    # one (directory, url, options) entry per donor; the archive name is the
    # same for every donor, exactly as it was passed before
    files = [
        ('donor{}'.format(sub),
         url.format(sub),
         dict(uncompress=True, move='freesurfer.tar.gz'))
        for sub in donors
    ]

    files = _fetch_files(data_dir, files, resume=resume, verbose=verbose)

    return dict(zip(donors, files))


def fetch_desikan_killiany(native=False, surface=False, *args, **kwargs):
    """
    Fetches Desikan-Killiany atlas shipped with `abagen`

    Parameters
    ----------
    native : bool, optional
        Whether to return individualized atlases in donor native space.
        Default: False
    surface : bool, optional
        Whether to return surface instead of volumetric parcellation. This
        option is currently incompatible with ``native=True``; instead, refer
        to :func:`abagen.datasets.fetch_freesurfer` for donor-specific surface
        atlases. Default: False
    *args, **kwargs
        Accepted but ignored by this function.

    Returns
    -------
    atlas : dict
        Dictionary with keys ['image', 'info'] pointing to atlas image and
        information files. If ``native`` then 'image' is a dictionary where
        keys are donor IDs and values are image paths. If ``surface`` then
        'image' is a tuple of GIFTI files (.label.gii.gz)

    References
    ----------
    Desikan, R. S., Ségonne, F., Fischl, B., Quinn, B. T., Dickerson, B. C.,
    Blacker, D., ... & Albert, M. S. (2006). An automated labeling system
    for subdividing the human cerebral cortex on MRI scans into gyral based
    regions of interest. Neuroimage, 31(3), 968-980.

    Examples
    --------
    >>> import abagen
    >>> atlas = abagen.fetch_desikan_killiany()
    >>> print(atlas['image'])  # doctest: +ELLIPSIS
    /.../abagen/data/atlas-desikankilliany.nii.gz
    >>> print(atlas['info'])  # doctest: +ELLIPSIS
    /.../abagen/data/atlas-desikankilliany.csv

    When fetching native-space atlases, `atlas['image']` will be a dictionary
    where the keys are donor IDs and the values are paths to the donor-specific
    atlases:

    >>> atlas = abagen.fetch_desikan_killiany(native=True)
    >>> print(atlas['image'].keys())
    dict_keys(['9861', '10021', '12876', '14380', '15496', '15697'])
    >>> print(atlas['image']['9861'])  # doctest: +ELLIPSIS
    /.../abagen/data/native_dk/9861/atlas-desikankilliany.nii.gz
    """

    # NOTE(review): ``native=True`` together with ``surface=True`` is
    # documented as unsupported but is not rejected here; the paths built for
    # that combination may not exist -- confirm before relying on them.

    # grab resource filenames, one entry per donor
    img = dict()
    for donor in check_donors('all'):
        if native:
            fp = os.path.join('data', 'native_dk', donor)
        else:
            # the group-level atlas lives in the same place for every donor
            fp = 'data'

        if surface:
            img[donor] = tuple(
                RESOURCE(
                    os.path.join(fp, f'atlas-desikankilliany-{h}.label.gii.gz')
                )
                for h in ('lh', 'rh')
            )
        else:
            img[donor] = RESOURCE(
                os.path.join(fp, 'atlas-desikankilliany.nii.gz')
            )

    if not native:
        # every donor maps to the same file(s), so collapse to a single entry
        img = first_entry(img)
    info = RESOURCE('data/atlas-desikankilliany.csv')

    return dict(image=img, info=info)


def fetch_gene_group(group):
    """
    Return list of gene acronyms belonging to provided `group`

    Groups are defined as in [DS1]_

    Parameters
    ----------
    group : {'brain', 'neuron', 'oligodendrocyte', 'synaptome', 'layers'}
        Desired gene group; matching is case-insensitive

    Returns
    -------
    genes : list of str
        Sorted list of gene acronyms

    Raises
    ------
    ValueError
        If `group` is not one of the available gene groups

    References
    ----------
    .. [DS1] Burt, J. B., Demirtaş, M., Eckner, W. J., Navejar, N. M., Ji, J.
       L., Martin, W. J., ... & Murray, J. D. (2018). Hierarchy of
       transcriptomic specialization across human cortex captured by
       structural neuroimaging topography. Nature neuroscience, 21(9), 1251.
    """

    groups = ['brain', 'neuron', 'oligodendrocyte', 'synaptome', 'layers']
    name = group.lower()
    if name not in groups:
        # report the value exactly as the caller provided it
        raise ValueError('Provided group {} not one of the available gene '
                         'groups: {}'.format(group, groups))

    fn = RESOURCE(os.path.join('data', 'burt2018_natneuro.csv.gz'))
    table = pd.read_csv(fn)
    # boolean mask rather than a string-built query expression
    genes = table.loc[table['group'] == name, 'acronym']

    return sorted(genes)


def fetch_donor_info():
    """
    Returns dataframe with donor demographic information

    Returns
    -------
    info : pandas.DataFrame
        With columns ['donor', 'age', 'sex', 'ethnicity', 'medical_conditions',
        'post_mortem_interval_hours'] detailing basic demographic info about
        donors
    """

    # the table is bundled with the package as a plain CSV file
    return pd.read_csv(RESOURCE(os.path.join('data', 'donor_info.csv')))


# pair of per-hemisphere entries: left hemisphere first, then right
Brain = namedtuple('Brain', ['lh', 'rh'])
# surface container filled from the pair returned by the surface loaders
Surface = namedtuple('Surface', ['vertices', 'faces'])


def fetch_fsaverage5(load=True):
    """
    Fetches and optionally loads fsaverage5 surface

    Parameters
    ----------
    load : bool, optional
        Whether to pre-load files. Default: True

    Returns
    -------
    brain : namedtuple ('lh', 'rh')
        If `load` is True, a namedtuple where each entry in the tuple is a
        hemisphere, represented as a namedtuple with fields ('vertices',
        'faces'). If `load` is False, a namedtuple where entries are filepaths.
    """

    hemispheres = []
    for hemi in ('lh', 'rh'):
        fn = RESOURCE(
            os.path.join('data', f'fsaverage5-pial-{hemi}.surf.gii.gz')
        )
        # either the loaded surface or just the path to the bundled file
        hemispheres.append(
            Surface(*load_gifti(fn).agg_data()) if load else fn
        )

    return Brain(*hemispheres)


def fetch_fsnative(donors, surf='pial', load=True, data_dir=None, resume=True,
                   verbose=1):
    """
    Fetches and optionally loads fsnative surface of `donors`

    Parameters
    ----------
    donors : str or list-of-str
        Donor(s) to download; can be either donor number or UID. Can also
        specify 'all' to download all available donors.
    surf : {'orig', 'white', 'pial', 'inflated', 'sphere'}, optional
        Which surface to load. Default: 'pial'
    load : bool, optional
        Whether to pre-load files. Default: True
    data_dir : str, optional
        Directory where data should be downloaded and unpacked. Default: $HOME/
        abagen-data
    resume : bool, optional
        Whether to resume download of a partly-downloaded file. Default: True
    verbose : int, optional
        Verbosity level (0 means no message). Default: 1

    Returns
    -------
    brain : namedtuple ('lh', 'rh')
        If `load` is True, a namedtuple where each entry in the tuple is a
        hemisphere, represented as a namedtuple with fields ('vertices',
        'faces'). If `load` is False, a namedtuple where entries are filepaths.
        If multiple donors are requested a dictionary is returned where keys
        are donor IDs.
    """

    donors = check_donors(donors)
    if len(donors) > 1:
        # forward every option by keyword: passing them positionally would
        # shift `data_dir` into `load` and drop `verbose` altogether
        return {
            donor: fetch_fsnative(donor, surf=surf, load=load,
                                  data_dir=data_dir, resume=resume,
                                  verbose=verbose)
            for donor in donors
        }

    donor = donors[0]
    fpath = fetch_freesurfer(donors=donor, data_dir=data_dir, resume=resume,
                             verbose=verbose)[donor]
    hemispheres = []
    for hemi in ('lh', 'rh'):
        fn = os.path.join(fpath, 'surf', f'{hemi}.{surf}')
        # either the loaded geometry or just the path to the surface file
        hemispheres.append(
            Surface(*nib.freesurfer.read_geometry(fn)) if load else fn
        )

    return Brain(*hemispheres)