import json
import logging
import re
from pathlib import Path
from typing import Optional, Union
import pandas as pd
from . import DOCSURL, DS_URL_PREFIX, readers
# configure logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
logger.addHandler(ch)
DEFAULT_READERS = {
"loom": readers.read_loom_to_anndata,
"rds": readers.read_seurat_to_anndata,
"h5ad": readers.read_anndata_to_anndata,
"hdf5": readers.read_10xhdf5_to_anndata,
"h5": readers.read_10xhdf5_to_anndata,
"tsv": readers.read_densetsv_to_anndata,
"csv": readers.read_densecsv_to_anndata,
}
DATA_DIR = Path("/fastgenomics/data")
DF_SORT_ORDER = [
"title",
"id",
"organism",
"tissue",
"numberOfCells",
"numberOfGenes",
"path",
"numberOfExpressionDataFiles",
"expressionDataFileNames",
"numberOfMetaDataFiles",
"metaDataFileNames",
"expressionDataFileInfos",
"metaDataFileInfos",
]
def get_datasets_df(data_dir: Path = DATA_DIR) -> pd.DataFrame:
"""Constructs a :py:func:`pandas.DataFrame` from all available datasets.
Parameters
----------
data_dir : Path, optional
Directory containing the datasets, e.g. ``fastgenomics/data``, by default DATA_DIR
Returns
-------
pd.DataFrame
A pandas DataFrame containing all available datasets
"""
ds_paths = get_ds_paths(data_dir=data_dir)
ds_df = pd.DataFrame()
for ds_path in ds_paths:
with open(ds_path / "dataset_info.json") as f:
info_df = json.load(f)
info_df["path"] = str(ds_path)
info_df["numberOfExpressionDataFiles"] = len(
info_df["expressionDataFileInfos"]
)
info_df["numberOfMetaDataFiles"] = len(info_df["metaDataFileInfos"])
_ = info_df.pop("schemaVersion", None)
ds_df = ds_df.append(info_df, ignore_index=True)
# sort colnames
col_names = ds_df.columns.values.tolist()
col_names_sorted = [name for name in DF_SORT_ORDER if name in col_names]
[col_names.remove(name) for name in DF_SORT_ORDER if name in col_names]
col_names_sorted.extend(col_names)
ds_df = ds_df[col_names_sorted]
# Format types
ds_df = ds_df.astype(
{
"numberOfCells": "int32",
"numberOfGenes": "int32",
"numberOfExpressionDataFiles": "int32",
"numberOfMetaDataFiles": "int32",
}
)
return ds_df
[docs]def ds_info(
ds: Optional[str] = None,
pretty: bool = None,
output: bool = None,
data_dir: Path = DATA_DIR,
) -> pd.DataFrame:
"""Get information on all available datasets in this analysis.
Parameters
----------
ds : Optional[str], optional
A single dataset ID or dataset title. If set, only this dataset will be displayed. Recommended to use with ``pretty``, by default None
pretty : bool, optional
Whether to display some nicely formatted output, by default True
output : bool, optional
Whether to return a DataFrame or not, by default True
data_dir : Path, optional
Directory containing the datasets, e.g. ``fastgenomics/data``, by default DATA_DIR
Returns
-------
pd.DataFrame
A pandas DataFrame containing all, or a single dataset (depends on ``ds``)
"""
if pretty is None:
pretty = ds is not None
if output is None:
output = ds is None
if not pretty and not output:
logger.warning(
'You have set "pretty" and "output" to false. Hence, this function will do/return nothing.'
)
return
try:
ds_df = get_datasets_df(data_dir=data_dir)
except NoDatasetsError as err:
logger.warning(err)
return pd.DataFrame()
def add_url(title, id):
return f'<a href="{DS_URL_PREFIX}{id}" target="_blank">{title}</a>'
def disp_pretty_df(df, index=True, header=True):
try:
from IPython.display import display, Markdown
df_html = df.to_html(
render_links=True,
escape=False,
header=header,
index=index,
justify="center",
)
display(Markdown(df_html))
except:
logger.warning(
"IPython not available. Pretty printing only works in Jupyter Notebooks."
)
if ds:
single_ds_df = select_ds_id(ds, df=ds_df)
single_ds_df["expressionDataFileNames"] = ", ".join(
[
expr["name"]
for expr in single_ds_df.loc[0, "expressionDataFileInfos"]
]
)
single_ds_df["metaDataFileNames"] = ", ".join(
[expr["name"] for expr in single_ds_df.loc[0, "metaDataFileInfos"]]
)
# Sort columns
single_col_names = single_ds_df.columns.values.tolist()
single_col_names_sorted = [
name for name in DF_SORT_ORDER if name in single_col_names
]
[
single_col_names.remove(name)
for name in DF_SORT_ORDER
if name in single_col_names
]
single_col_names_sorted.extend(single_col_names)
single_ds_df = single_ds_df[single_col_names_sorted]
if pretty:
pretty_df = single_ds_df
pretty_df["expressionDataFileNames"] = "<br>".join(
[
expr["name"]
for expr in pretty_df.loc[0, "expressionDataFileInfos"]
]
)
pretty_df["metaDataFileNames"] = ", ".join(
[expr["name"] for expr in pretty_df.loc[0, "metaDataFileInfos"]]
)
empty_cols = [
col for col in pretty_df.columns if pretty_df.loc[0, col] == ""
]
pretty_df = pretty_df.drop(
labels=["expressionDataFileInfos", "metaDataFileInfos"]
+ empty_cols,
axis=1,
errors="ignore",
)
pretty_df.loc[0, "title"] = pretty_df.apply(
lambda x: add_url(x.title, x.id), axis=1
).squeeze()
disp_pretty_df(pretty_df.T, header=False)
if output:
return single_ds_df
else:
if pretty:
pretty_df = ds_df.drop(
labels=[
"description",
"license",
"preprocessing",
"citation",
"webLink",
"file",
"expressionDataFileInfos",
"metaDataFileInfos",
],
axis=1,
errors="ignore",
)
pretty_df["title"] = pretty_df.apply(
lambda x: add_url(x.title, x.id), axis=1
)
disp_pretty_df(pretty_df)
if output:
return ds_df
[docs]def load_data(
ds: Optional[str] = None,
data_dir: Path = DATA_DIR,
additional_readers: dict = {},
expression_file: Optional[str] = None,
as_format: Optional[str] = None,
):
"""This function loads a single dataset into an AnnData object.
If there are multiple datasets available you need to specify one by setting
``ds`` to a dataset `id` or dataset `title`.
To get an overview of availabe dataset use :py:func:`ds_info`
Parameters
----------
ds : str, optional
A single dataset ID or dataset title to select a dataset to be loaded.
If only one dataset is available you do not need to set this parameter, by default None
data_dir : Path, optional
Directory containing the datasets, e.g. ``fastgenomics/data``, by default DATA_DIR
additional_readers : dict, optional
Used to specify your own readers for the specific data set format.
Dict key needs to be file extension (e.g., h5ad), dict value a function.
Still experimental, by default {}
expression_file: str, Optional
The name of the expression file to load.
Only needed when there are multiple expression files in a dataset.
as_format: str, optional
Specifies which reader should be uses for this dataset. Overwrites the auto-detection
of the format. Possible parameters are the file extensions of our supported data
formats: ``h5ad``, ``h5``, ``hdf5``, ``loom``, ``rds``, ``csv``, ``tsv``.
Returns
-------
AnnData Object
A single AnnData object with dataset id in `obs` and all dataset metadata in `uns`
Examples
--------
To use a custom reader for files with the extension ".fg", you have to define a function first:
>>> def my_loader(file):
... anndata = magic_file_loading(file)
... return anndata
You can then use this reader like this:
>>> fgread.load_data("my_dataset", additional_readers={"fg": my_loader})
"""
readers = {**DEFAULT_READERS, **additional_readers}
if ds:
single_df = select_ds_id(ds, df=get_datasets_df(data_dir=data_dir))
else:
single_df = get_datasets_df(data_dir=data_dir)
if len(single_df) > 1:
raise RuntimeError(
"There is more than one dataset available in this analysis. "
"Please select one by its ID or title. "
'You can list available datasets by using "fgread.ds_info()".'
)
exp_count = single_df.loc[0, "numberOfExpressionDataFiles"]
meta_count = single_df.loc[0, "numberOfMetaDataFiles"]
if exp_count == 0:
raise TypeError(
f"There is no expression data available in this data set.\n"
f"Metadata files: {meta_count}."
)
exp_files = [
exp["name"] for exp in single_df.loc[0, "expressionDataFileInfos"]
]
if expression_file:
if expression_file in exp_files:
file = expression_file
else:
raise KeyError(
f'Expression file "{expression_file}" not found in dataset. '
f"Available expression files are: {exp_files}."
)
else:
if exp_count == 1:
file = single_df.loc[0, "expressionDataFileInfos"][0]["name"]
else:
raise TypeError(
f"There are {exp_count} expression data files in this dataset. "
'Please specify which one you want to load using the parameter "expression_file". '
f"Available expression files are: {exp_files}."
)
title = single_df.loc[0, "title"]
ds_id = single_df.loc[0, "id"]
path = single_df.loc[0, "path"]
metadata_dict = single_df.loc[0].to_dict()
if as_format:
format = as_format.lower()
else:
try:
format = file.rsplit(".", 1)[1].lower()
logger.info(f'Expression file "{file}" with format "{format}".')
except ValueError as e:
raise ValueError(
f'The expression file "{file}" has no valid file suffix.'
).with_traceback(e.__traceback__)
if format in readers:
if meta_count != 0:
logger.info(
f"There are {meta_count} metadata files in this dataset. "
"This data will not be integrated into the anndata object."
)
logger.info(
f'Loading file "{file}" from dataset "{title}" in format "{format}" from directory "{path}"...\n'
)
adata = readers[format](Path(path) / file)
adata.uns["ds_metadata"] = {ds_id: {"title": title}}
adata.uns["ds_metadata_raw"] = {ds_id: str(metadata_dict)}
adata.obs["fg_id"] = ds_id
n_genes = adata.shape[1]
n_cells = adata.shape[0]
logger.info(
f'Loaded dataset "{title}" with {n_cells} cells and {n_genes} genes.\n'
f"==================================================================\n"
)
return adata
else:
raise KeyError(
f'Unsupported file format "{format}", use one of {list(readers)}. '
f'You can force the usage of a specific reader by setting "as_format" to a supported format. '
f"In addition, you can also implement your own reading function. See {DOCSURL} for more information."
)
def select_ds_id(ds: str, df: pd.DataFrame = None) -> pd.DataFrame:
"""Select a single dataset from a pandas DataFrame by its ID or title
Parameters
----------
ds : str
A single dataset ID or dataset title for selection
df : pd.DataFrame, optional
A pandas DataFrame from which a single entry is selected, by default None
Returns
-------
pd.DataFrame
A pandas DataFrame with only the selected dataset.
"""
single_df = df.loc[(df["id"] == ds) | (df["title"] == ds)].reset_index(
drop=True
)
len_df = len(single_df)
if len_df == 1:
return single_df.copy()
elif len_df == 0:
add_err = ""
if not ds.startswith("dataset-"):
add_err = " Please note that dataset titles can be changed by the owner. To be safe, you might want to consider dataset IDs instead."
raise KeyError("Your selection matches no datasets." + add_err)
else:
display(single_df)
raise KeyError(
f"Your selection matches {len_df} datasets. Please make sure to select exactly one."
)
def get_ds_paths(data_dir: Union[str, Path] = DATA_DIR) -> list:
"""Gets available datasets for this analysis from path.
Parameters
----------
data_dir : Union[str,Path], optional
Directory containing the datasets, e.g. "fastgenomics/data", by default DATA_DIR
Returns
-------
list
A list of dataset paths
"""
data_dir = Path(data_dir)
if not data_dir.exists():
raise NoDatasetsError(
f'There are no datasets attached to this analysis. Path "{data_dir}" does not exist.'
)
paths = [
Path(subdir)
for subdir in sorted(data_dir.iterdir())
if subdir.is_dir() and re.match(r"^dataset_\d{4}$", subdir.name)
]
if not paths:
raise NoDatasetsError(
f'There are no datasets attached to this analysis. Path "{data_dir}" is empty.'
)
return paths
class NoDatasetsError(Exception):
"""Raised when no datasets are attached"""
pass