Source code for abagen.io

# -*- coding: utf-8 -*-
"""
Functions for loading the various files associated with the AHBA microarray and
RNAseq datasets.

This also contains functionality for optionally converting the downloaded CSV
files to parquet format, which provides much faster I/O access / quicker load
times.
"""

import os.path as op
import pandas as pd
try:
    eng = pd.io.parquet.get_engine('fastparquet')
    assert 'SNAPPY' in eng.api.compression.compressions
    use_parq = True
# pandas version too low OR don't have fastparquet installed
except (AttributeError, ImportError, AssertionError):
    use_parq = False


def _make_parquet(fname, convert_only=False):
    """
    Loads `fname`, converting to parquet file if it does not already exist

    Parameters
    ----------
    fname : str
        Path to data file
    convert_only : bool, optional
        Check if parquet version of `fname` exists and convert if it doesn't;
        if it does, just return. Default: False

    Returns
    -------
    data : pandas.DataFrame
        Data loaded from `fname`
    """

    # get ideal parquet filename
    parqname = fname.rpartition('.csv')[0] + '.parq'

    # if it exists, load it as parquet
    if op.exists(parqname):
        if convert_only:
            return
        data = pd.read_parquet(parqname, engine='fastparquet')
    # otherwise, load CSV and save to parquet
    else:
        data = pd.read_csv(fname, header=None)
        data.columns = data.columns.astype(str)
        data.to_parquet(parqname, engine='fastparquet')
        if convert_only:
            return

    return data


[docs]def read_microarray(fname, copy=False, parquet=True): """ Loads MicroarrayExpression.csv file found at `fname` Microarray files contain raw expression data for all the tissue samples taken from a single donor across all genetic probes. Parameters ---------- fname : str Path to MicroarrayExpression.csv file copy : bool, optional Whether to return a copy if `fname` is a pre-loaded pandas.Dataframe. Default: False parquet : bool, optional Whether to load data from parquet file instead of CSV. If a parquet file does not already exist then one will be created for faster loading in the future. Only available if ``fastparquet`` and ``python-snappy`` module are installed. Default: True Returns ------- microarray : (P, S) pandas.DataFrame Dataframe containing microarray expression data, where `P` is probes and `S` is samples. The row index is the unique probe ID assigned during processing, which can be used to match data to the information obtained with :func:`read_probes`. The column index is the unique sample ID (integer, beginning at 0) which can be used to match data to the information obtained with :func:`read_annotation`. """ try: if use_parq and parquet: data = _make_parquet(fname, convert_only=False) data = data.set_index('0') else: data = pd.read_csv(fname, header=None, index_col=0) data.index.name = 'probe_id' data.columns = pd.Series(range(1, len(data.columns) + 1), name='sample_id') except (AttributeError, ValueError, TypeError): if not isinstance(fname, pd.DataFrame): raise TypeError('Provided fname must be filepath to Microarray' 'Expression.csv file from Allen Human Brain ' 'Atlas.') data = fname.copy() if copy else fname return data
[docs]def read_ontology(fname, copy=False): """ Loads Ontology.csv file found at `fname` Ontology files contain information on the anatomical delineations used by the Allen Institute when obtaining samples from donor brains, and are used in their online Brain Viewer to colorize regions. These files should be the same for every donors. This information can be used to ensure that microarray samples are appropriately matched to anatomical regions. Parameters ---------- fname : str Path to Ontology.csv file copy : bool, optional Whether to return a copy if `fname` is a pre-loaded pandas.Dataframe. Default: False Returns ------- ontology : (R, 8) pandas.DataFrame Dataframe containing ontology information for `R` anatomical regions used by the Allen Institute. Columns include: 'id', 'acronym', 'name', 'parent_structure_id', 'hemisphere', 'graph_order', 'structure_id_path', and 'color_hex_triplet'. """ try: data = pd.read_csv(fname) except (ValueError, TypeError): if not isinstance(fname, pd.DataFrame): raise TypeError('Provided fname must be filepath to Ontology.csv ' 'file from Allen Human Brain Atlas.') data = fname.copy() if copy else fname return data
[docs]def read_pacall(fname, copy=False, parquet=True): """ Loads PACall.csv file found at `fname` PA files contain a present/absent flag indicating whether the corresponding probe's expression is above background noise. It is set to 1 when both of the following conditions are met: 1. The mean signal of the probe's expression is significantly different from the corresponding background, as assessed by a 2-sided t-test where p < 0.01, and 2. The difference between the background subtracted signal and the background is significant (> 2.6 * background standard deviation). This information can be used to discard "noisy" probes that might not be contributing high-quality expression information. Parameters ---------- fname : str Path to PACall.csv file copy : bool, optional Whether to return a copy if `fname` is a pre-loaded pandas.Dataframe. Default: False parquet : bool, optional Whether to load data from parquet file instead of CSV. If a parquet file does not already exist then one will be created for faster loading in the future. Only available if ``fastparquet`` and ``python-snappy`` module are installed. Default: True Returns ------- pacall : (P, S) pandas.DataFrame Dataframe containing a binary indicator determining whether expression information for each probe exceeded background noise in a given sample, where `P` is probes and `S` is samples. The row index is the unique probe ID assigned during processing, which can be used to match data to the information obtained with :func:`read_probes`. The column index is the unique sample ID (integer, beginning at 1) which can be used to match data to the information obtained with :func:`read_annotation`. """ try: if use_parq and parquet: data = _make_parquet(fname, convert_only=False) data = data.set_index('0') else: data = pd.read_csv(fname, header=None, index_col=0) data.index.name = 'probe_id' data.columns = pd.Series(range(1, len(data.columns) + 1), name='sample_id') except (AttributeError, ValueError, TypeError): if not isinstance(fname, pd.DataFrame): raise TypeError('Provided fname must be filepath to PACall.csv ' 'file from Allen Human Brain Atlas.') data = fname.copy() if copy else fname return data
[docs]def read_probes(fname, copy=False): """ Loads Probes.csv file found at `fname` Probe files contain metadata on all genetic probes used in the AHBA data. These files should be the same for every donor. This information can be used to e.g., query expression data for certain genes, collapse data across probes from the same gene, etc. Parameters ---------- fname : str Path to Probes.csv file copy : bool, optional Whether to return a copy if `fname` is a pre-loaded pandas.Dataframe. Default: False Returns ------- probes : (P, 6) pandas.DataFrame Dataframe containing information for `P` genetic probes. The row index is the unique probe ID assigned during processing, which can be used to match metadata to information obtained with :func:`read_microarray` and :func:`read_pacall`. Columns include 'probe_name', 'gene_id', 'gene_symbol', 'gene_name', 'entrez_id', and 'chromosome'. """ try: data = pd.read_csv(fname, index_col=0, dtype={'entrez_id': pd.Int64Dtype()}) except (ValueError, TypeError): if not isinstance(fname, pd.DataFrame): raise TypeError('Provided fname must be filepath to Probes.csv ' 'file from Allen Human Brain Atlas.') data = fname.copy() if copy else fname return data
[docs]def read_annotation(fname, copy=False): """ Loads SampleAnnot.csv file found at `fname` Sample annotation files contain metadata on all the tissue samples taken from a single donor brain, including the spatial location of the samples. This information can be used to combine samples within the same anatomical region across donors. Parameters ---------- fname : str Path to SampleAnnot.csv file copy : bool, optional Whether to return a copy if `fname` is a pre-loaded pandas.Dataframe. Default: False Returns ------- annotation : (S, 13) pandas.DataFrame Dataframe containing structural information on `S` samples. The row index is the unique sample ID (integer, beginning with 1) which can be used to match data to the information obtained with e.g., :func:`read_microarray`. Notes ----- If the provided annotation file is from microarray expression data (obtained by, e.g., `abagen.fetch_microarray()`), then the returned DataFrame will have the following columns: 'structure_id', 'slab_num', 'well_id', 'slab_type', 'structure_acronym', 'structure_name', 'polygon_id', 'mri_voxel_x', 'mri_voxel_y', 'mri_voxel_z', 'mni_x', 'mni_y', 'mni_z'. If the provided annotation file is from RNAseq data (obtained by, e.g., `abagen.fetch_rnaseq()`), then the returned DataFrame will have the following columns: 'RNAseq_sample_name', 'replicate_sample', 'sample_name', 'well_id', 'microarray_run_id', 'ontology_color', 'main_structure', 'sub_structure', 'structure_id', 'structure_acronym', 'hemisphere', 'brain', 'million_clusters', 'clip_percentage', 'RIN_RNA_squality', 'rnaseq_run_id', 'A.Pct', 'C.Pct', 'G.Pct', 'T.Pct', 'N.Pct' """ mapper = dict( ontology_structure_id='structure_id', ontology_structure_acronym='structure_acronym' ) try: data = pd.read_csv(fname) data.index = pd.Series(range(1, len(data.index) + 1), name='sample_id') except (ValueError, TypeError): if not isinstance(fname, pd.DataFrame): raise TypeError('Provided fname must be filepath to Annotation' '.csv file from Allen Human Brain Atlas.') data = fname.copy() if copy else fname data.rename(mapper, axis='columns', inplace=True, errors='ignore') return data
[docs]def read_tpm(fname, copy=False): """ Loads RNAseqTPM.csv file found at `fname` RNAseq TPM files contain TPM values for all the tissue samples taken from a single donor across all genes. TPM values are scaled fragment (read) counts derived using RSEM. Parameters ---------- fname : str Path to RNAseqTPM.csv file copy : bool, optional Whether to return a copy if `fname` is a pre-loaded pandas.Dataframe. Default: False Returns ------- tpm : (G, S) pandas.DataFrame Dataframe containing RNAseq TPM expression data, where `G` is genes and `S` is samples. The row index is the unique gene symbol assigned during processing, which can be used to match data to the information obtained with :func:`read_genes`. The column index is the unique sample ID (integer, beginning at 0) which can be used to match data to the information obtained with :func:`read_annotation`. """ try: data = pd.read_csv(fname, header=None, index_col=0) data.index.name = 'gene_symbol' data.columns = pd.Series(range(1, len(data.columns) + 1), name='sample_id') except (ValueError, TypeError): if not isinstance(fname, pd.DataFrame): raise TypeError('Provided fname must be filepath to RNAseqTPM' '.csv file from Allen Human Brain Atlas.') data = fname.copy() if copy else fname return data
[docs]def read_counts(fname, copy=False): """ Loads RNAseqCounts.csv file found at `fname` RNAseq count files contain fragment counts for all the tissue samples taken from a single donor across all genes. Fragment counts can be fractional, as ambiguous reads are distributed between relevant transcripts. For present / absent calling, a value of zero indicates no transcript was seen. Parameters ---------- fname : str Path to RNAseqCounts.csv file copy : bool, optional Whether to return a copy if `fname` is a pre-loaded pandas.Dataframe. Default: False Returns ------- tpm : (G, S) pandas.DataFrame Dataframe containing RNAseq count expression data, where `G` is genes and `S` is samples. The row index is the unique gene symbol assigned during processing, which can be used to match data to the information obtained with :func:`read_genes`. The column index is the unique sample ID (integer, beginning at 0) which can be used to match data to the information obtained with :func:`read_annotation`. """ try: data = pd.read_csv(fname, header=None, index_col=0) data.index.name = 'gene_symbol' data.columns = pd.Series(range(1, len(data.columns) + 1), name='sample_id') except (ValueError, TypeError): if not isinstance(fname, pd.DataFrame): raise TypeError('Provided fname must be filepath to RNAseqCounts' '.csv file from Allen Human Brain Atlas.') data = fname.copy() if copy else fname return data
[docs]def read_genes(fname, copy=False): """ Loads Genes.csv file found at `fname` Genes files contain metadata on all genes used in the RNAseq AHBA data. These files should be the same for every donor. Parameters ---------- fname : str Path to Genes.csv file copy : bool, optional Whether to return a copy if `fname` is a pre-loaded pandas.Dataframe. Default: False Returns ------- genes : (G, 11) pandas.DataFrame Dataframe containing information for `G` unique genes. The row index is the unique gene symbol which can be used to match metadata to information obtained with :func:`read_tpm` and :func:`read_counts`. Columns include 'gene_id', 'entrez_id', 'chromosome', 'strand', 'number_of_transcripts', 'median_transcriptome_length', 'median_genome_length', 'median_number_of_exons', 'median_gene_start', and 'median_gene_end' """ try: data = pd.read_csv(fname, index_col=0) except (ValueError, TypeError): if not isinstance(fname, pd.DataFrame): raise TypeError('Provided fname must be filepath to Annotation' '.csv file from Allen Human Brain Atlas.') data = fname.copy() if copy else fname return data