From 0c8021822479438ed855ebe3ebf769a7cb2f5e22 Mon Sep 17 00:00:00 2001 From: Marco Varrone Date: Tue, 28 Oct 2025 12:16:07 +0100 Subject: [PATCH 1/2] Load Xenium mask labels using Dask --- src/spatialdata_io/readers/xenium.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 87100c08..1244ecd0 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -458,7 +458,7 @@ def _get_labels_and_indices_mapping( with zarr_open(str(tmpdir), mode="r") as z: # get the labels - masks = z["masks"][f"{mask_index}"][...] + masks = da.from_array(z["masks"][f"{mask_index}"]) labels = Labels2DModel.parse( masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs ) From d3a247c161f3f0b19b814c84ffd4d52e6da0fd67 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 6 Jan 2026 20:49:37 +0100 Subject: [PATCH 2/2] persistent extraction of .zarr.zip --- src/spatialdata_io/readers/xenium.py | 330 +++++++++++++++------------ 1 file changed, 190 insertions(+), 140 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 1244ecd0..d9e32bd5 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -52,7 +52,11 @@ __all__ = ["xenium", "xenium_aligned_image", "xenium_explorer_selection"] -@deprecation_alias(cells_as_shapes="cells_as_circles", cell_boundaries="cells_boundaries", cell_labels="cells_labels") +@deprecation_alias( + cells_as_shapes="cells_as_circles", + cell_boundaries="cells_boundaries", + cell_labels="cells_labels", +) @inject_docs(xx=XeniumKeys) def xenium( path: str | Path, @@ -69,6 +73,7 @@ def xenium( cells_table: bool = True, n_jobs: int = 1, gex_only: bool = True, + cleanup_labels_zarr_tmpdir: bool = True, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), image_models_kwargs: Mapping[str, Any] = MappingProxyType({}), labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -197,62 +202,86 @@ def xenium( else: table = None - if version is not None and version >= packaging.version.parse("2.0.0") and table is not None: - cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs) - if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]): - warnings.warn( - 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation' - " table. This could be due to trying to read a new version that is not supported yet. Please " - "report this issue.", - UserWarning, - stacklevel=2, - ) - table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL] - table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT] - - polygons = {} - labels = {} - tables = {} - points = {} - images = {} - - # From the public release notes here: - # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa - # we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used. - # This column is currently not found in the preview data, while I think it is needed in order to unambiguously match - # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus - # labels. - if nucleus_labels: - labels["nucleus_labels"], _ = _get_labels_and_indices_mapping( - path, - XeniumKeys.CELLS_ZARR, - specs, - mask_index=0, - labels_name="nucleus_labels", - labels_models_kwargs=labels_models_kwargs, - ) - if cells_labels: - labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping( - path, - XeniumKeys.CELLS_ZARR, - specs, - mask_index=1, - labels_name="cell_labels", - labels_models_kwargs=labels_models_kwargs, + tmpdir = tempfile.TemporaryDirectory() + if not cleanup_labels_zarr_tmpdir: + logging.info( + f"Extracting cells zarr in the temporary directory {tmpdir.name}. Since `cleanup_labels_zarr_tmpdir` " + f"is set to `False`, this directory cleanup will be deferred (up to the end of the process). " + f"If the process is interrupted aburptly cleanup may not occurr. Use with care to avoid uncleaned up " + f"temporary directories." ) - if cell_labels_indices_mapping is not None and table is not None: - if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]): + zip_file = path / XeniumKeys.CELLS_ZARR + with zipfile.ZipFile(zip_file, "r") as zip_ref: + zip_ref.extractall(tmpdir.name) + try: + cells_zarr = zarr.open(str(tmpdir.name), mode="r") + if version is not None and version >= packaging.version.parse("2.0.0") and table is not None: + cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr=cells_zarr) + if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]): warnings.warn( - "The cell_id column in the cell_labels_table does not match the cell_id column derived from the " - "cell labels data. This could be due to trying to read a new version that is not supported yet. " - "Please report this issue.", + 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation' + " table. This could be due to trying to read a new version that is not supported yet. Please " + "report this issue.", UserWarning, stacklevel=2, ) - else: - table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"] - if not cells_as_circles: - table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" + table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL] + table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT] + + polygons = {} + labels = {} + tables = {} + points = {} + images = {} + + # From the public release notes here: + # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa + # we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used. + # This column is currently not found in the preview data, while I think it is needed in order to unambiguously match + # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus + # labels. + + if nucleus_labels: + labels["nucleus_labels"], _ = _get_labels_and_indices_mapping( + cells_zarr, + cleanup_labels_zarr_tmpdir, + specs, + mask_index=0, + labels_name="nucleus_labels", + labels_models_kwargs=labels_models_kwargs, + ) + if cells_labels: + labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping( + cells_zarr, + cleanup_labels_zarr_tmpdir, + specs, + mask_index=1, + labels_name="cell_labels", + labels_models_kwargs=labels_models_kwargs, + ) + if cell_labels_indices_mapping is not None and table is not None: + if not pd.DataFrame.equals( + cell_labels_indices_mapping["cell_id"], + table.obs[str(XeniumKeys.CELL_ID)], + ): + warnings.warn( + "The cell_id column in the cell_labels_table does not match the cell_id column derived from the " + "cell labels data. This could be due to trying to read a new version that is not supported yet. " + "Please report this issue.", + UserWarning, + stacklevel=2, + ) + else: + table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"] + if not cells_as_circles: + table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" + except Exception as e: + tmpdir.cleanup() + raise e + + # we cleanup now if we don't have lazy data + if not cells_labels and not nucleus_labels or cleanup_labels_zarr_tmpdir: + tmpdir.cleanup() if nucleus_boundaries: polygons["nucleus_boundaries"] = _get_polygons( @@ -381,7 +410,13 @@ def filter(self, record: logging.LogRecord) -> bool: if table is not None: tables["table"] = table - elements_dict = {"images": images, "labels": labels, "points": points, "tables": tables, "shapes": polygons} + elements_dict = { + "images": images, + "labels": labels, + "points": points, + "tables": tables, + "shapes": polygons, + } if cells_as_circles: elements_dict["shapes"][specs["region"]] = circles sdata = SpatialData(**elements_dict) @@ -402,7 +437,11 @@ def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series: def _get_polygons( - path: Path, file: str, specs: dict[str, Any], n_jobs: int, idx: ArrayLike | None = None + path: Path, + file: str, + specs: dict[str, Any], + n_jobs: int, + idx: ArrayLike | None = None, ) -> GeoDataFrame: def _poly(arr: ArrayLike) -> Polygon: return Polygon(arr[:-1]) @@ -441,8 +480,8 @@ def _poly(arr: ArrayLike) -> Polygon: def _get_labels_and_indices_mapping( - path: Path, - file: str, + cells_zarr: zarr.Group, + cleanup_labels_zarr_tmpdir: bool, specs: dict[str, Any], mask_index: int, labels_name: str, @@ -451,74 +490,71 @@ def _get_labels_and_indices_mapping( if mask_index not in [0, 1]: raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.") - with tempfile.TemporaryDirectory() as tmpdir: - zip_file = path / XeniumKeys.CELLS_ZARR - with zipfile.ZipFile(zip_file, "r") as zip_ref: - zip_ref.extractall(tmpdir) + # get the labels + if cleanup_labels_zarr_tmpdir: + masks = cells_zarr["masks"][f"{mask_index}"][...] + else: + masks = da.from_array(cells_zarr["masks"][f"{mask_index}"]) + labels = Labels2DModel.parse( + masks, + dims=("y", "x"), + transformations={"global": Identity()}, + **labels_models_kwargs, + ) - with zarr_open(str(tmpdir), mode="r") as z: - # get the labels - masks = da.from_array(z["masks"][f"{mask_index}"]) - labels = Labels2DModel.parse( - masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs + # build the matching table + version = _parse_version_of_xenium_analyzer(specs) + if mask_index == 0: + # nuclei currently not supported + return labels, None + if version is None or version is not None and version < packaging.version.parse("1.3.0"): + # supported in version 1.3.0 and not supported in version 1.0.2; conservatively, let's assume it is not + # supported in versions < 1.3.0 + return labels, None + + cell_id, dataset_suffix = cells_zarr["cell_id"][...].T + cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix) + + # this information will probably be available in the `label_id` column for version > 2.0.0 (see public + # release notes mentioned above) + real_label_index = get_element_instances(labels).values + + # background removal + if real_label_index[0] == 0: + real_label_index = real_label_index[1:] + + if version < packaging.version.parse("2.0.0"): + expected_label_index = cells_zarr["seg_mask_value"][...] + + if not np.array_equal(expected_label_index, real_label_index): + raise ValueError( + "The label indices from the labels differ from the ones from the input data. Please report " + f"this issue. Real label indices: {real_label_index}, expected label indices: " + f"{expected_label_index}." ) - - # build the matching table - version = _parse_version_of_xenium_analyzer(specs) - if mask_index == 0: - # nuclei currently not supported - return labels, None - if version is None or version is not None and version < packaging.version.parse("1.3.0"): - # supported in version 1.3.0 and not supported in version 1.0.2; conservatively, let's assume it is not - # supported in versions < 1.3.0 - return labels, None - - cell_id, dataset_suffix = z["cell_id"][...].T - cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix) - - # this information will probably be available in the `label_id` column for version > 2.0.0 (see public - # release notes mentioned above) - real_label_index = get_element_instances(labels).values - - # background removal - if real_label_index[0] == 0: - real_label_index = real_label_index[1:] - - if version < packaging.version.parse("2.0.0"): - expected_label_index = z["seg_mask_value"][...] - - if not np.array_equal(expected_label_index, real_label_index): - raise ValueError( - "The label indices from the labels differ from the ones from the input data. Please report " - f"this issue. Real label indices: {real_label_index}, expected label indices: " - f"{expected_label_index}." - ) - else: - labels_positional_indices = z["polygon_sets"][f"{mask_index}"]["cell_index"][...] - if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))): - raise ValueError( - "The positional indices of the labels do not match the expected range. Please report this " - "issue." - ) - - # labels_index is an uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems - indices_mapping = pd.DataFrame( - { - "region": labels_name, - "cell_id": cell_id_str, - "label_index": real_label_index.astype(np.int64), - } + else: + labels_positional_indices = cells_zarr["polygon_sets"][f"{mask_index}"]["cell_index"][...] + if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))): + raise ValueError( + "The positional indices of the labels do not match the expected range. Please report this issue." ) - # because AnnData converts the indices to str - indices_mapping.index = indices_mapping.index.astype(str) - return labels, indices_mapping + + # labels_index is an uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems + indices_mapping = pd.DataFrame( + { + "region": labels_name, + "cell_id": cell_id_str, + "label_index": real_label_index.astype(np.int64), + } + ) + # because AnnData converts the indices to str + indices_mapping.index = indices_mapping.index.astype(str) + return labels, indices_mapping @inject_docs(xx=XeniumKeys) def _get_cells_metadata_table_from_zarr( - path: Path, - file: str, - specs: dict[str, Any], + cells_zarr: zarr.Group, ) -> AnnData: """Read cells metadata from ``{xx.CELLS_ZARR}``. @@ -526,35 +562,34 @@ def _get_cells_metadata_table_from_zarr( nucleus_count for versions >= 2.0.0. """ # for version >= 2.0.0, in this function we could also parse the segmentation method used to obtain the masks - with tempfile.TemporaryDirectory() as tmpdir: - zip_file = path / XeniumKeys.CELLS_ZARR - with zipfile.ZipFile(zip_file, "r") as zip_ref: - zip_ref.extractall(tmpdir) - - with zarr_open(str(tmpdir), mode="r") as z: - x = z["cell_summary"][...] - column_names = z["cell_summary"].attrs["column_names"] - df = pd.DataFrame(x, columns=column_names) - cell_id_prefix = z["cell_id"][:, 0] - dataset_suffix = z["cell_id"][:, 1] + x = cells_zarr["cell_summary"][...] + column_names = cells_zarr["cell_summary"].attrs["column_names"] + df = pd.DataFrame(x, columns=column_names) + cell_id_prefix = cells_zarr["cell_id"][:, 0] + dataset_suffix = cells_zarr["cell_id"][:, 1] - cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix) - df[XeniumKeys.CELL_ID] = cell_id_str - # because AnnData converts the indices to str - df.index = df.index.astype(str) - return df + cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix) + df[XeniumKeys.CELL_ID] = cell_id_str + # because AnnData converts the indices to str + df.index = df.index.astype(str) + return df def _get_points(path: Path, specs: dict[str, Any]) -> Table: table = read_parquet(path / XeniumKeys.TRANSCRIPTS_FILE) table["feature_name"] = table["feature_name"].apply( - lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), meta=("feature_name", "object") + lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), + meta=("feature_name", "object"), ) transform = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y")) points = PointsModel.parse( table, - coordinates={"x": XeniumKeys.TRANSCRIPTS_X, "y": XeniumKeys.TRANSCRIPTS_Y, "z": XeniumKeys.TRANSCRIPTS_Z}, + coordinates={ + "x": XeniumKeys.TRANSCRIPTS_X, + "y": XeniumKeys.TRANSCRIPTS_Y, + "z": XeniumKeys.TRANSCRIPTS_Z, + }, feature_key=XeniumKeys.FEATURE_NAME, instance_key=XeniumKeys.CELL_ID, transformations={"global": transform}, @@ -576,7 +611,12 @@ def _get_tables_and_circles( adata.obs["region"] = specs["region"] adata.obs["region"] = adata.obs["region"].astype("category") adata.obs[XeniumKeys.CELL_ID] = _decode_cell_id_column(adata.obs[XeniumKeys.CELL_ID]) - table = TableModel.parse(adata, region=specs["region"], region_key="region", instance_key=str(XeniumKeys.CELL_ID)) + table = TableModel.parse( + adata, + region=specs["region"], + region_key="region", + instance_key=str(XeniumKeys.CELL_ID), + ) if cells_as_circles: transform = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y")) radii = np.sqrt(adata.obs[XeniumKeys.CELL_AREA].to_numpy() / np.pi) @@ -605,7 +645,11 @@ def _get_images( # let's add a dummy channel as a temporary workaround. image = da.concatenate([image, da.zeros_like(image[0:1])], axis=0) return Image2DModel.parse( - image, transformations={"global": Identity()}, dims=("c", "y", "x"), rgb=None, **image_models_kwargs + image, + transformations={"global": Identity()}, + dims=("c", "y", "x"), + rgb=None, + **image_models_kwargs, ) @@ -620,14 +664,18 @@ def _add_aligned_images( csv_files = list(path.glob("*.csv")) for file in ome_tif_files: element_name = None - for suffix in [XeniumKeys.ALIGNED_HE_IMAGE_SUFFIX, XeniumKeys.ALIGNED_IF_IMAGE_SUFFIX]: + for suffix in [ + XeniumKeys.ALIGNED_HE_IMAGE_SUFFIX, + XeniumKeys.ALIGNED_IF_IMAGE_SUFFIX, + ]: if file.name.endswith(suffix): element_name = suffix.replace(XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_REMOVE, "") break if element_name is not None: # check if an alignment file exists expected_filename = file.name.replace( - XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_REMOVE, XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_ADD + XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_REMOVE, + XeniumKeys.ALIGNMENT_FILE_SUFFIX_TO_ADD, ) alignment_files = [f for f in csv_files if f.name == expected_filename] assert len(alignment_files) <= 1, f"Found more than one alignment file for {file.name}." @@ -833,7 +881,9 @@ def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suf return np.array(cell_id_str) -def prefix_suffix_uint32_from_cell_id_str(cell_id_str: ArrayLike) -> tuple[ArrayLike, ArrayLike]: +def prefix_suffix_uint32_from_cell_id_str( + cell_id_str: ArrayLike, +) -> tuple[ArrayLike, ArrayLike]: # parse the string into the prefix and suffix cell_id_prefix_str, dataset_suffix = zip(*[x.split("-") for x in cell_id_str], strict=False) dataset_suffix_int = [int(x) for x in dataset_suffix]