From 9ac2d92722b33fd45df465fc8eccd3db9f70ee04 Mon Sep 17 00:00:00 2001 From: Tim Treis Date: Thu, 27 Feb 2025 12:54:54 +0100 Subject: [PATCH 1/6] now removing faulty IDs --- src/spatialdata_io/readers/xenium.py | 48 ++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index ff036067..d9715b40 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -248,21 +248,33 @@ def xenium( table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" if nucleus_boundaries: + invalid_nuc_ids = _find_invalid_ids(path, XeniumKeys.NUCLEUS_BOUNDARIES_FILE) + if len(invalid_nuc_ids) > 0: + logging.warning( + f"Found {len(invalid_nuc_ids)} invalid polygons for nuclei, removing the masks corresponding to the IDs: {invalid_nuc_ids}" + ) polygons["nucleus_boundaries"] = _get_polygons( path, XeniumKeys.NUCLEUS_BOUNDARIES_FILE, specs, n_jobs, idx=table.obs[str(XeniumKeys.CELL_ID)].copy(), + invalid_ids=invalid_nuc_ids, ) if cells_boundaries: + invalid_cell_ids = _find_invalid_ids(path, XeniumKeys.CELL_BOUNDARIES_FILE) + if len(invalid_cell_ids) > 0: + logging.warning( + f"Found {len(invalid_cell_ids)} invalid polygons for cells, removing the masks corresponding to the IDs: {invalid_cell_ids}" + ) polygons["cell_boundaries"] = _get_polygons( path, XeniumKeys.CELL_BOUNDARIES_FILE, specs, n_jobs, idx=table.obs[str(XeniumKeys.CELL_ID)].copy(), + invalid_ids=invalid_cell_ids, ) if transcripts: @@ -338,7 +350,9 @@ def filter(self, record: logging.LogRecord) -> bool: logger.removeFilter(IgnoreSpecificMessage()) if table is not None: - tables["table"] = table + valid_nucleus_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_nuc_ids) + valid_cell_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_cell_ids) + tables["table"] = table[valid_nucleus_mask & valid_cell_mask].copy() elements_dict = {"images": images, "labels": labels, 
"points": points, "tables": tables, "shapes": polygons} if cells_as_circles: @@ -354,6 +368,16 @@ def filter(self, record: logging.LogRecord) -> bool: return sdata +def _find_invalid_ids( + path: Path, + file: str, +) -> ArrayLike: + """Filter out cell ids with too few vertices to form a valid polygon.""" + df = pq.read_table(path / file).to_pandas() + invalid_ids = df.groupby(XeniumKeys.CELL_ID).filter(lambda x: len(x) < 4)[XeniumKeys.CELL_ID].unique() + return [] if len(invalid_ids) == 0 else invalid_ids + + def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series: if isinstance(cell_id_column.iloc[0], bytes): return cell_id_column.apply(lambda x: x.decode("utf-8")) @@ -361,17 +385,37 @@ def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series: def _get_polygons( - path: Path, file: str, specs: dict[str, Any], n_jobs: int, idx: ArrayLike | None = None + path: Path, + file: str, + specs: dict[str, Any], + n_jobs: int, + idx: ArrayLike | None = None, + invalid_ids: ArrayLike | None = None, ) -> GeoDataFrame: def _poly(arr: ArrayLike) -> Polygon: return Polygon(arr[:-1]) + if invalid_ids is None: + invalid_ids = [] + # seems to be faster than pd.read_parquet df = pq.read_table(path / file).to_pandas() + df[XeniumKeys.CELL_ID] = _decode_cell_id_column(df[XeniumKeys.CELL_ID]) + + # Filter based on valid cell IDs if idx is provided + if idx is not None: + idx = idx[~idx.isin(invalid_ids)] + if len(invalid_ids) > 0: + idx = idx.reset_index(drop=True) + df = df[df[XeniumKeys.CELL_ID].isin(idx)] + else: + # If no idx provided, just (potentially) filter out invalid IDs + df = df[~df[XeniumKeys.CELL_ID].isin(invalid_ids)] group_by = df.groupby(XeniumKeys.CELL_ID) index = pd.Series(group_by.indices.keys()) index = _decode_cell_id_column(index) + out = Parallel(n_jobs=n_jobs)( delayed(_poly)(i.to_numpy()) for _, i in group_by[[XeniumKeys.BOUNDARIES_VERTEX_X, XeniumKeys.BOUNDARIES_VERTEX_Y]] From 1abd69a39ede273032935db5a7b95e0b8e2431ed Mon Sep 17 00:00:00 
2001 From: Tim Treis Date: Sat, 22 Mar 2025 12:53:44 +0100 Subject: [PATCH 2/6] updated action --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 8546b3d7..62e51ab0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,7 +26,7 @@ jobs: PYTHON: ${{ matrix.python }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python }} uses: actions/setup-python@v4 with: From 2e228cf42fe34c8831edc23132ded3825e6a8007 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 8 Jan 2026 16:10:21 +0100 Subject: [PATCH 3/6] wip support for polygons with 3 vertices without filtering --- src/spatialdata_io/readers/xenium.py | 68 ++++++++++------------------ 1 file changed, 25 insertions(+), 43 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index b0f5663b..4380d8bb 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -259,33 +259,21 @@ def xenium( table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" if nucleus_boundaries: - invalid_nuc_ids = _find_invalid_ids(path, XeniumKeys.NUCLEUS_BOUNDARIES_FILE) - if len(invalid_nuc_ids) > 0: - logging.warning( - f"Found {len(invalid_nuc_ids)} invalid polygons for nuclei, removing the masks corresponding to the IDs: {invalid_nuc_ids}" - ) polygons["nucleus_boundaries"] = _get_polygons( path, XeniumKeys.NUCLEUS_BOUNDARIES_FILE, specs, n_jobs, idx=table.obs[str(XeniumKeys.CELL_ID)].copy(), - invalid_ids=invalid_nuc_ids, ) if cells_boundaries: - invalid_cell_ids = _find_invalid_ids(path, XeniumKeys.CELL_BOUNDARIES_FILE) - if len(invalid_cell_ids) > 0: - logging.warning( - f"Found {len(invalid_cell_ids)} invalid polygons for cells, removing the masks corresponding to the IDs: {invalid_cell_ids}" - ) polygons["cell_boundaries"] = _get_polygons( path, 
XeniumKeys.CELL_BOUNDARIES_FILE, specs, n_jobs, idx=table.obs[str(XeniumKeys.CELL_ID)].copy(), - invalid_ids=invalid_cell_ids, ) if transcripts: @@ -394,10 +382,11 @@ def filter(self, record: logging.LogRecord) -> bool: del image_models_kwargs["c_coords"] logger.removeFilter(IgnoreSpecificMessage()) - if table is not None: - valid_nucleus_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_nuc_ids) - valid_cell_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_cell_ids) - tables["table"] = table[valid_nucleus_mask & valid_cell_mask].copy() + # if table is not None: + # valid_nucleus_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_nuc_ids) + # valid_cell_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_cell_ids) + # tables["table"] = table[valid_nucleus_mask & valid_cell_mask].copy() + tables["table"] = table elements_dict = { "images": images, @@ -419,16 +408,6 @@ def filter(self, record: logging.LogRecord) -> bool: return sdata -def _find_invalid_ids( - path: Path, - file: str, -) -> ArrayLike: - """Filter out cell ids with too few vertices to form a valid polygon.""" - df = pq.read_table(path / file).to_pandas() - invalid_ids = df.groupby(XeniumKeys.CELL_ID).filter(lambda x: len(x) < 4)[XeniumKeys.CELL_ID].unique() - return [] if len(invalid_ids) == 0 else invalid_ids - - def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series: if isinstance(cell_id_column.iloc[0], bytes): return cell_id_column.apply(lambda x: x.decode("utf-8")) @@ -441,27 +420,30 @@ def _get_polygons( specs: dict[str, Any], n_jobs: int, idx: ArrayLike | None = None, - invalid_ids: ArrayLike | None = None, ) -> GeoDataFrame: - def _poly(arr: ArrayLike) -> Polygon: - return Polygon(arr[:-1]) - - if invalid_ids is None: - invalid_ids = [] - # seems to be faster than pd.read_parquet df = pq.read_table(path / file).to_pandas() df[XeniumKeys.CELL_ID] = _decode_cell_id_column(df[XeniumKeys.CELL_ID]) - # Filter based on valid cell IDs if idx is provided - if idx is not None: - idx = 
idx[~idx.isin(invalid_ids)] - if len(invalid_ids) > 0: - idx = idx.reset_index(drop=True) - df = df[df[XeniumKeys.CELL_ID].isin(idx)] - else: - # If no idx provided, just (potentially) filter out invalid IDs - df = df[~df[XeniumKeys.CELL_ID].isin(invalid_ids)] + # # filter out cell ids with too few vertices to form a valid polygon. + # invalid_ids = df.groupby(XeniumKeys.CELL_ID).filter(lambda x: len(x) < 3)[ + # XeniumKeys.CELL_ID].unique() + # invalid_ids = [] if len(invalid_ids) == 0 else invalid_ids + # + # if len(invalid_ids) > 0: + # logging.warning( + # f"Found {len(invalid_ids)} invalid polygons for {file}, removing the masks corresponding to the IDs: {invalid_ids}" + # ) + # + # # Filter based on valid cell IDs if idx is provided + # if idx is not None: + # idx = idx[~idx.isin(invalid_ids)] + # if len(invalid_ids) > 0: + # idx = idx.reset_index(drop=True) + # df = df[df[XeniumKeys.CELL_ID].isin(idx)] + # else: + # # If no idx provided, just (potentially) filter out invalid IDs + # df = df[~df[XeniumKeys.CELL_ID].isin(invalid_ids)] group_by = df.groupby(XeniumKeys.CELL_ID) index = pd.Series(group_by.indices.keys()) @@ -470,7 +452,7 @@ def _poly(arr: ArrayLike) -> Polygon: index = _decode_cell_id_column(index) out = Parallel(n_jobs=n_jobs)( - delayed(_poly)(i.to_numpy()) + delayed(Polygon)(i.to_numpy()) for _, i in group_by[[XeniumKeys.BOUNDARIES_VERTEX_X, XeniumKeys.BOUNDARIES_VERTEX_Y]] ) geo_df = GeoDataFrame({"geometry": out}) From 9768bfdb4e4d1756be77954e57c54691581ba9c3 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 8 Jan 2026 16:11:38 +0100 Subject: [PATCH 4/6] fix --- src/spatialdata_io/readers/xenium.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 4380d8bb..6b946ff8 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -382,11 +382,11 @@ def filter(self, record: logging.LogRecord) -> bool: 
del image_models_kwargs["c_coords"] logger.removeFilter(IgnoreSpecificMessage()) - # if table is not None: + if table is not None: + tables["table"] = table # valid_nucleus_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_nuc_ids) # valid_cell_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_cell_ids) # tables["table"] = table[valid_nucleus_mask & valid_cell_mask].copy() - tables["table"] = table elements_dict = { "images": images, @@ -423,8 +423,8 @@ def _get_polygons( ) -> GeoDataFrame: # seems to be faster than pd.read_parquet df = pq.read_table(path / file).to_pandas() - df[XeniumKeys.CELL_ID] = _decode_cell_id_column(df[XeniumKeys.CELL_ID]) + # df[XeniumKeys.CELL_ID] = _decode_cell_id_column(df[XeniumKeys.CELL_ID]) # # filter out cell ids with too few vertices to form a valid polygon. # invalid_ids = df.groupby(XeniumKeys.CELL_ID).filter(lambda x: len(x) < 3)[ # XeniumKeys.CELL_ID].unique() From 57a1cb20f87ea12022ff609e39b52c927350ec12 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 9 Jan 2026 11:52:41 +0100 Subject: [PATCH 5/6] shapes in xenium remove constraint on representation --- src/spatialdata_io/readers/xenium.py | 38 ++++++++-------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 6b946ff8..f14e21af 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -4,9 +4,7 @@ import logging import os import re -import tempfile import warnings -import zipfile from pathlib import Path from types import MappingProxyType from typing import TYPE_CHECKING, Any @@ -245,7 +243,10 @@ def xenium( labels_models_kwargs=labels_models_kwargs, ) if cell_labels_indices_mapping is not None and table is not None: - if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]): + if not pd.DataFrame.equals( + cell_labels_indices_mapping["cell_id"], 
table.obs[str(XeniumKeys.CELL_ID)], + ): warnings.warn( "The cell_id column in the cell_labels_table does not match the cell_id column derived from the " "cell labels data. This could be due to trying to read a new version that is not supported yet. " @@ -384,9 +385,6 @@ def filter(self, record: logging.LogRecord) -> bool: if table is not None: tables["table"] = table - # valid_nucleus_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_nuc_ids) - # valid_cell_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_cell_ids) - # tables["table"] = table[valid_nucleus_mask & valid_cell_mask].copy() elements_dict = { "images": images, @@ -424,27 +422,6 @@ def _get_polygons( # seems to be faster than pd.read_parquet df = pq.read_table(path / file).to_pandas() - # df[XeniumKeys.CELL_ID] = _decode_cell_id_column(df[XeniumKeys.CELL_ID]) - # # filter out cell ids with too few vertices to form a valid polygon. - # invalid_ids = df.groupby(XeniumKeys.CELL_ID).filter(lambda x: len(x) < 3)[ - # XeniumKeys.CELL_ID].unique() - # invalid_ids = [] if len(invalid_ids) == 0 else invalid_ids - # - # if len(invalid_ids) > 0: - # logging.warning( - # f"Found {len(invalid_ids)} invalid polygons for {file}, removing the masks corresponding to the IDs: {invalid_ids}" - # ) - # - # # Filter based on valid cell IDs if idx is provided - # if idx is not None: - # idx = idx[~idx.isin(invalid_ids)] - # if len(invalid_ids) > 0: - # idx = idx.reset_index(drop=True) - # df = df[df[XeniumKeys.CELL_ID].isin(idx)] - # else: - # # If no idx provided, just (potentially) filter out invalid IDs - # df = df[~df[XeniumKeys.CELL_ID].isin(invalid_ids)] - group_by = df.groupby(XeniumKeys.CELL_ID) index = pd.Series(group_by.indices.keys()) # convert the index to str since we will compare it with an AnnData object, where the index is a str @@ -492,7 +469,12 @@ def _get_labels_and_indices_mapping( z = zarr.open(store, mode="r") # get the labels masks = da.from_array(z["masks"][f"{mask_index}"]) - labels = 
Labels2DModel.parse(masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs) + labels = Labels2DModel.parse( + masks, + dims=("y", "x"), + transformations={"global": Identity()}, + **labels_models_kwargs, + ) # build the matching table version = _parse_version_of_xenium_analyzer(specs) From 091c5f3806a0a6e9122beda2990789aba131911e Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 9 Jan 2026 11:54:41 +0100 Subject: [PATCH 6/6] fix formatting --- src/spatialdata_io/readers/xenium.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index f14e21af..5c2be682 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -243,10 +243,7 @@ def xenium( labels_models_kwargs=labels_models_kwargs, ) if cell_labels_indices_mapping is not None and table is not None: - if not pd.DataFrame.equals( - cell_labels_indices_mapping["cell_id"], - table.obs[str(XeniumKeys.CELL_ID)], - ): + if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]): warnings.warn( "The cell_id column in the cell_labels_table does not match the cell_id column derived from the " "cell labels data. This could be due to trying to read a new version that is not supported yet. " @@ -469,12 +466,7 @@ def _get_labels_and_indices_mapping( z = zarr.open(store, mode="r") # get the labels masks = da.from_array(z["masks"][f"{mask_index}"]) - labels = Labels2DModel.parse( - masks, - dims=("y", "x"), - transformations={"global": Identity()}, - **labels_models_kwargs, - ) + labels = Labels2DModel.parse(masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs) # build the matching table version = _parse_version_of_xenium_analyzer(specs)