diff --git a/README.md b/README.md index 02213221..f598404d 100644 --- a/README.md +++ b/README.md @@ -111,8 +111,7 @@ Marconato, L., Palla, G., Yamauchi, K.A. et al. SpatialData: an open and univers [link-docs]: https://spatialdata.scverse.org/projects/io/en/latest/ [link-api]: https://spatialdata.scverse.org/projects/io/en/latest/api.html [link-cli]: https://spatialdata.scverse.org/projects/io/en/latest/cli.html - -[//]: # (numfocus-fiscal-sponsor-attribution) +[//]: # "numfocus-fiscal-sponsor-attribution" spatialdata-io is part of the scverse® project ([website](https://scverse.org), [governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/). If you like scverse® and want to support our mission, please consider making a tax-deductible [donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs. diff --git a/src/spatialdata_io/readers/_utils/_read_10x_h5.py b/src/spatialdata_io/readers/_utils/_read_10x_h5.py deleted file mode 100644 index 4c5b33cc..00000000 --- a/src/spatialdata_io/readers/_utils/_read_10x_h5.py +++ /dev/null @@ -1,137 +0,0 @@ -# BSD 3-Clause License - -# Copyright (c) 2017 F. Alexander Wolf, P. Angerer, Theis Lab -# All rights reserved. - -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: - -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. - -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. - -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. - -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# code below taken from https://github.com/scverse/scanpy/blob/master/scanpy/readwrite.py - -from pathlib import Path -from typing import Any - -import h5py -import numpy as np -from anndata import AnnData -from spatialdata._logging import logger - - -def _read_10x_h5( - filename: str | Path, - genome: str | None = None, - gex_only: bool = True, -) -> AnnData: - """ - Read 10x-Genomics-formatted hdf5 file. - - Parameters - ---------- - filename - Path to a 10x hdf5 file. - genome - Filter expression to genes within this genome. For legacy 10x h5 - files, this must be provided if the data contains more than one genome. - gex_only - Only keep 'Gene Expression' data and ignore other feature types, - e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom' - - Returns - ------- - Annotated data matrix, where observations/cells are named by their - barcode and variables/genes by gene name. - Stores the following information: - - - `~anndata.AnnData.X`: The data matrix is stored - - `~anndata.AnnData.obs_names`: Cell names - - `~anndata.AnnData.var_names`: Gene names - - `['gene_ids']`: Gene IDs - - `['feature_types']`: Feature types - """ - start = logger.info(f"reading {filename}") - filename = Path(filename) if isinstance(filename, str) else filename - is_present = filename.is_file() - if not is_present: - logger.debug(f"... did not find original file {filename}") - with h5py.File(str(filename), "r") as f: - v3 = "/matrix" in f - - if v3: - adata = _read_v3_10x_h5(filename, start=start) - if genome: - if genome not in adata.var["genome"].values: - raise ValueError( - f"Could not find data corresponding to genome `{genome}` in `{filename}`. " - f'Available genomes are: {list(adata.var["genome"].unique())}.' - ) - adata = adata[:, adata.var["genome"] == genome] - if gex_only: - adata = adata[:, adata.var["feature_types"] == "Gene Expression"] - if adata.is_view: - adata = adata.copy() - else: - raise ValueError("Versions older than V3 are not supported.") - return adata - - -def _read_v3_10x_h5(filename: str | Path, *, start: Any | None = None) -> AnnData: - """Read hdf5 file from Cell Ranger v3 or later versions.""" - with h5py.File(str(filename), "r") as f: - try: - dsets: dict[str, Any] = {} - _collect_datasets(dsets, f["matrix"]) - - from scipy.sparse import csr_matrix - - M, N = dsets["shape"] - data = dsets["data"] - if dsets["data"].dtype == np.dtype("int32"): - data = dsets["data"].view("float32") - data[:] = dsets["data"] - matrix = csr_matrix( - (data, dsets["indices"], dsets["indptr"]), - shape=(N, M), - ) - adata = AnnData( - matrix, - obs={"obs_names": dsets["barcodes"].astype(str)}, - var={ - "var_names": dsets["name"].astype(str), - "gene_ids": dsets["id"].astype(str), - "feature_types": dsets["feature_type"].astype(str), - "genome": dsets["genome"].astype(str), - }, - ) - return adata - except KeyError: - raise Exception("File is missing one or more required datasets.") - - -def _collect_datasets(dsets: dict[str, Any], group: h5py.Group) -> None: - for k, v in group.items(): - if isinstance(v, h5py.Dataset): - dsets[k] = v[:] - else: - _collect_datasets(dsets, v) diff --git a/src/spatialdata_io/readers/_utils/_utils.py b/src/spatialdata_io/readers/_utils/_utils.py index 82676098..b7d3a986 100644 --- a/src/spatialdata_io/readers/_utils/_utils.py +++ b/src/spatialdata_io/readers/_utils/_utils.py @@ -5,14 +5,13 @@ from pathlib import Path from typing import Any, Union +import scanpy as sc from anndata import AnnData, read_text from h5py import File from ome_types import from_tiff from ome_types.model import Pixels, UnitsLength from spatialdata._logging import logger -from spatialdata_io.readers._utils._read_10x_h5 import _read_10x_h5 - PathLike = Union[os.PathLike, str] # type:ignore[type-arg] @@ -24,7 +23,7 @@ def _read_counts( ) -> tuple[AnnData, str]: path = Path(path) if counts_file.endswith(".h5"): - adata: AnnData = _read_10x_h5(path / counts_file, **kwargs) + adata: AnnData = sc.read_10x_h5(path / counts_file, **kwargs) with File(path / counts_file, mode="r") as f: attrs = dict(f.attrs) if library_id is None: diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index ff036067..721708f6 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -17,6 +17,7 @@ import packaging.version import pandas as pd import pyarrow.parquet as pq +import scanpy as sc import tifffile import zarr from anndata import AnnData @@ -42,7 +43,6 @@ from spatialdata_io._constants._constants import XeniumKeys from spatialdata_io._docs import inject_docs from spatialdata_io._utils import deprecation_alias -from spatialdata_io.readers._utils._read_10x_h5 import _read_10x_h5 from spatialdata_io.readers._utils._utils import _initialize_raster_models_kwargs __all__ = ["xenium", "xenium_aligned_image", "xenium_explorer_selection"] @@ -520,7 +520,7 @@ def _get_points(path: Path, specs: dict[str, Any]) -> Table: def _get_tables_and_circles( path: Path, cells_as_circles: bool, specs: dict[str, Any] ) -> AnnData | tuple[AnnData, AnnData]: - adata = _read_10x_h5(path / XeniumKeys.CELL_FEATURE_MATRIX_FILE) + adata = sc.read_10x_h5(path / XeniumKeys.CELL_FEATURE_MATRIX_FILE) metadata = pd.read_parquet(path / XeniumKeys.CELL_METADATA_FILE) np.testing.assert_array_equal(metadata.cell_id.astype(str), adata.obs_names.values) circ = metadata[[XeniumKeys.CELL_X, XeniumKeys.CELL_Y]].to_numpy()