# Source code for abagen.mouse.io

# -*- coding: utf-8 -*-
"""
Functions to fetch (and load) mouse gene and structure lists from Allen API
"""

import os.path as op
from pkg_resources import resource_filename

import pandas as pd

from .utils import _make_api_query
from ..datasets import _get_dataset_dir


def fetch_allenref_genes(entry_type=None, cache=True, data_dir=None,
                         verbose=True):
    """
    Loads all genes from Allen Reference database

    Parameters
    ----------
    entry_type : {'id', 'acronym', 'name'}, optional
        The type of gene identifier to load. Specifying 'id' returns a list of
        numerical gene IDs, 'acronym' returns a list of short-form gene
        acronyms, and 'name' returns full gene names. If not specified, returns
        a dataframe with all information. Default: None
    cache : bool, optional
        Whether to use cached gene information (if it exists). Setting to False
        will overwrite cache. Default: True
    data_dir : str, optional
        Directory where data should be downloaded and unpacked. Default: $HOME/
        abagen-data
    verbose : bool, optional
        Whether to print status message. Default: True

    Returns
    -------
    genes : list or :obj:`pandas.DataFrame`
        Genes in Allen Reference database. A dataframe with columns 'id',
        'acronym', and 'name' if `entry_type` is None; otherwise a list with
        the values of the requested column

    Raises
    ------
    ValueError
        If `entry_type` is provided and is not one of 'id', 'acronym', 'name'

    Notes
    -----
    May require internet access to make query to the Allen API (which will take
    some time); after query is made once the results are cached.
    """

    # check that provided entry_type is valid before doing any (slow) work
    entries = ['id', 'acronym', 'name']
    if entry_type is not None and entry_type not in entries:
        raise ValueError('Provided entry_type {} is not valid. Specified '
                         'entry_type must be one of {}.'
                         .format(entry_type, entries))

    # if file doesn't exist or we want to overwrite the cache for some reason
    # download the data from the Allen API
    fname = op.join(_get_dataset_dir('allenmouse', data_dir=data_dir,
                                     verbose=verbose),
                    'reference_genes.csv')
    if not op.isfile(fname) or not cache:
        if verbose:
            print('Gene information not available locally; querying '
                  'Allen API for information. This may take some time...')
        out = _make_api_query('Gene', criteria='products[id$eq1]',
                              attributes=entries, suffix='num_rows=all')
        # select the columns explicitly so that a fresh query yields the same
        # column layout as the cached read in the `else` branch below
        genes = pd.DataFrame(out)[entries]
        # sort entries by gene ID
        genes = genes.sort_values('id').reset_index(drop=True)
        # save information to disk
        genes.to_csv(fname, index=False)
    else:
        genes = pd.read_csv(fname)[entries]

    # extract only relevant entry_type, if desired
    if entry_type is not None:
        genes = genes[entry_type].tolist()

    return genes


def fetch_allenref_structures(entry_type=None, cache=True, data_dir=None,
                              verbose=True):
    """
    Loads all anatomical structures in the Allen Reference Atlas

    Parameters
    ----------
    entry_type : {'id', 'acronym', 'name'}, optional
        The type of structural identifier to load. Specifying 'id' returns a
        list of numerical structure IDs, 'acronym' returns a list of short-form
        structure acronyms, and 'name' returns full structure names. If not
        specified, returns a dataframe with all information. Default: None
    cache : bool, optional
        Whether to use cached structure information (if it exists). Setting to
        False will overwrite cache. Default: True
    data_dir : str, optional
        Directory where data should be downloaded and unpacked. Default: $HOME/
        abagen-data
    verbose : bool, optional
        Whether to print status message. Default: True

    Returns
    -------
    structures : list or :obj:`pandas.DataFrame`
        Anatomical structures in Allen Reference Atlas. A dataframe with
        columns 'id', 'acronym', and 'name' if `entry_type` is None; otherwise
        a list with the values of the requested column

    Raises
    ------
    ValueError
        If `entry_type` is provided and is not one of 'id', 'acronym', 'name'

    Notes
    -----
    May require internet access to make query to the Allen API (which will take
    some time); after query is made once the results are cached.
    """

    # check that provided entry_type is valid before doing any (slow) work
    entries = ['id', 'acronym', 'name']
    if entry_type is not None and entry_type not in entries:
        raise ValueError('Provided entry_type {} is not valid. Specified '
                         'entry_type must be one of {}.'
                         .format(entry_type, entries))

    # if file doesn't exist or we want to overwrite the cache for some reason
    # download the data from the Allen API
    fname = op.join(_get_dataset_dir('allenmouse', data_dir=data_dir,
                                     verbose=verbose),
                    'reference_atlas.csv')
    if not op.isfile(fname) or not cache:
        if verbose:
            print('Structure information not available locally; querying '
                  'Allen API for information...')
        out = _make_api_query('Structure', criteria='ontology[id$eq1]',
                              attributes=entries, suffix='num_rows=all')
        # select the columns explicitly so that a fresh query yields the same
        # column layout as the cached read in the `else` branch below
        structures = pd.DataFrame(out)[entries]
        # sort entries by structure ID
        structures = structures.sort_values('id').reset_index(drop=True)
        # save information to disk
        structures.to_csv(fname, index=False)
    else:
        structures = pd.read_csv(fname)[entries]

    # extract only relevant entry_type, if desired
    if entry_type is not None:
        structures = structures[entry_type].tolist()

    return structures


def fetch_rubinov2015_structures(entry_type=None):
    """
    Loads subset of anatomical structures in Allen Reference Atlas from [MI1]_

    Parameters
    ----------
    entry_type : {'id', 'acronym', 'name'}, optional
        The type of structural identifier to load. Specifying 'id' returns a
        list of numerical structure IDs, 'acronym' returns a list of short-form
        structure acronyms, and 'name' returns full structure names. If not
        specified, returns a dataframe with all information. Default: None

    Returns
    -------
    structures : list or :obj:`pandas.DataFrame`
        Anatomical structures in Allen Reference Atlas from [MI1]_. A dataframe
        with columns 'id', 'acronym', and 'name' if `entry_type` is None;
        otherwise a list with the values of the requested column

    Raises
    ------
    ValueError
        If `entry_type` is provided and is not one of 'id', 'acronym', 'name'

    References
    ----------
    .. [MI1] Rubinov, M., Ypma, R. J., Watson, C., & Bullmore, E. T. (2015).
       Wiring cost and topological participation of the mouse brain connectome.
       Proceedings of the National Academy of Sciences, 112(32), 10032-10037.
    """

    # check that provided entry_type is valid before touching the data file
    entries = ['id', 'acronym', 'name']
    if entry_type is not None and entry_type not in entries:
        raise ValueError('Provided entry_type {} is not valid. Specified '
                         'entry_type must be one of {}.'
                         .format(entry_type, entries))

    # structure list is read from a data file packaged with abagen
    fname = resource_filename('abagen', 'data/rubinov2015_pnas.csv.gz')
    structures = pd.read_csv(fname)[entries]

    # extract only relevant entry_type, if desired
    if entry_type is not None:
        structures = structures[entry_type].tolist()

    return structures