From 9ac2d92722b33fd45df465fc8eccd3db9f70ee04 Mon Sep 17 00:00:00 2001 From: Tim Treis Date: Thu, 27 Feb 2025 12:54:54 +0100 Subject: [PATCH 1/6] now removing faulty IDs --- src/spatialdata_io/readers/xenium.py | 48 ++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index ff036067..d9715b40 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -248,21 +248,33 @@ def xenium( table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" if nucleus_boundaries: + invalid_nuc_ids = _find_invalid_ids(path, XeniumKeys.NUCLEUS_BOUNDARIES_FILE) + if len(invalid_nuc_ids) > 0: + logging.warning( + f"Found {len(invalid_nuc_ids)} invalid polygons for nuclei, removing the masks corresponding to the IDs: {invalid_nuc_ids}" + ) polygons["nucleus_boundaries"] = _get_polygons( path, XeniumKeys.NUCLEUS_BOUNDARIES_FILE, specs, n_jobs, idx=table.obs[str(XeniumKeys.CELL_ID)].copy(), + invalid_ids=invalid_nuc_ids, ) if cells_boundaries: + invalid_cell_ids = _find_invalid_ids(path, XeniumKeys.CELL_BOUNDARIES_FILE) + if len(invalid_cell_ids) > 0: + logging.warning( + f"Found {len(invalid_cell_ids)} invalid polygons for cells, removing the masks corresponding to the IDs: {invalid_cell_ids}" + ) polygons["cell_boundaries"] = _get_polygons( path, XeniumKeys.CELL_BOUNDARIES_FILE, specs, n_jobs, idx=table.obs[str(XeniumKeys.CELL_ID)].copy(), + invalid_ids=invalid_cell_ids, ) if transcripts: @@ -338,7 +350,9 @@ def filter(self, record: logging.LogRecord) -> bool: logger.removeFilter(IgnoreSpecificMessage()) if table is not None: - tables["table"] = table + valid_nucleus_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_nuc_ids) + valid_cell_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_cell_ids) + tables["table"] = table[valid_nucleus_mask & valid_cell_mask].copy() elements_dict = {"images": images, "labels": labels, 
"points": points, "tables": tables, "shapes": polygons} if cells_as_circles: @@ -354,6 +368,16 @@ def filter(self, record: logging.LogRecord) -> bool: return sdata +def _find_invalid_ids( + path: Path, + file: str, +) -> ArrayLike: + """Filter out cell ids with too few vertices to form a valid polygon.""" + df = pq.read_table(path / file).to_pandas() + invalid_ids = df.groupby(XeniumKeys.CELL_ID).filter(lambda x: len(x) < 4)[XeniumKeys.CELL_ID].unique() + return [] if len(invalid_ids) == 0 else invalid_ids + + def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series: if isinstance(cell_id_column.iloc[0], bytes): return cell_id_column.apply(lambda x: x.decode("utf-8")) @@ -361,17 +385,37 @@ def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series: def _get_polygons( - path: Path, file: str, specs: dict[str, Any], n_jobs: int, idx: ArrayLike | None = None + path: Path, + file: str, + specs: dict[str, Any], + n_jobs: int, + idx: ArrayLike | None = None, + invalid_ids: ArrayLike | None = None, ) -> GeoDataFrame: def _poly(arr: ArrayLike) -> Polygon: return Polygon(arr[:-1]) + if invalid_ids is None: + invalid_ids = [] + # seems to be faster than pd.read_parquet df = pq.read_table(path / file).to_pandas() + df[XeniumKeys.CELL_ID] = _decode_cell_id_column(df[XeniumKeys.CELL_ID]) + + # Filter based on valid cell IDs if idx is provided + if idx is not None: + idx = idx[~idx.isin(invalid_ids)] + if len(invalid_ids) > 0: + idx = idx.reset_index(drop=True) + df = df[df[XeniumKeys.CELL_ID].isin(idx)] + else: + # If no idx provided, just (potentially) filter out invalid IDs + df = df[~df[XeniumKeys.CELL_ID].isin(invalid_ids)] group_by = df.groupby(XeniumKeys.CELL_ID) index = pd.Series(group_by.indices.keys()) index = _decode_cell_id_column(index) + out = Parallel(n_jobs=n_jobs)( delayed(_poly)(i.to_numpy()) for _, i in group_by[[XeniumKeys.BOUNDARIES_VERTEX_X, XeniumKeys.BOUNDARIES_VERTEX_Y]] From 1abd69a39ede273032935db5a7b95e0b8e2431ed Mon Sep 17 00:00:00 
2001 From: Tim Treis Date: Sat, 22 Mar 2025 12:53:44 +0100 Subject: [PATCH 2/6] updated action --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 8546b3d7..62e51ab0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,7 +26,7 @@ jobs: PYTHON: ${{ matrix.python }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python }} uses: actions/setup-python@v4 with: From 2e228cf42fe34c8831edc23132ded3825e6a8007 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 8 Jan 2026 16:10:21 +0100 Subject: [PATCH 3/6] wip support for polygons with 3 vertices without filtering --- src/spatialdata_io/readers/xenium.py | 68 ++++++++++------------------ 1 file changed, 25 insertions(+), 43 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index b0f5663b..4380d8bb 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -259,33 +259,21 @@ def xenium( table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" if nucleus_boundaries: - invalid_nuc_ids = _find_invalid_ids(path, XeniumKeys.NUCLEUS_BOUNDARIES_FILE) - if len(invalid_nuc_ids) > 0: - logging.warning( - f"Found {len(invalid_nuc_ids)} invalid polygons for nuclei, removing the masks corresponding to the IDs: {invalid_nuc_ids}" - ) polygons["nucleus_boundaries"] = _get_polygons( path, XeniumKeys.NUCLEUS_BOUNDARIES_FILE, specs, n_jobs, idx=table.obs[str(XeniumKeys.CELL_ID)].copy(), - invalid_ids=invalid_nuc_ids, ) if cells_boundaries: - invalid_cell_ids = _find_invalid_ids(path, XeniumKeys.CELL_BOUNDARIES_FILE) - if len(invalid_cell_ids) > 0: - logging.warning( - f"Found {len(invalid_cell_ids)} invalid polygons for cells, removing the masks corresponding to the IDs: {invalid_cell_ids}" - ) polygons["cell_boundaries"] = _get_polygons( path, 
XeniumKeys.CELL_BOUNDARIES_FILE, specs, n_jobs, idx=table.obs[str(XeniumKeys.CELL_ID)].copy(), - invalid_ids=invalid_cell_ids, ) if transcripts: @@ -394,10 +382,11 @@ def filter(self, record: logging.LogRecord) -> bool: del image_models_kwargs["c_coords"] logger.removeFilter(IgnoreSpecificMessage()) - if table is not None: - valid_nucleus_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_nuc_ids) - valid_cell_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_cell_ids) - tables["table"] = table[valid_nucleus_mask & valid_cell_mask].copy() + # if table is not None: + # valid_nucleus_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_nuc_ids) + # valid_cell_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_cell_ids) + # tables["table"] = table[valid_nucleus_mask & valid_cell_mask].copy() + tables["table"] = table elements_dict = { "images": images, @@ -419,16 +408,6 @@ def filter(self, record: logging.LogRecord) -> bool: return sdata -def _find_invalid_ids( - path: Path, - file: str, -) -> ArrayLike: - """Filter out cell ids with too few vertices to form a valid polygon.""" - df = pq.read_table(path / file).to_pandas() - invalid_ids = df.groupby(XeniumKeys.CELL_ID).filter(lambda x: len(x) < 4)[XeniumKeys.CELL_ID].unique() - return [] if len(invalid_ids) == 0 else invalid_ids - - def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series: if isinstance(cell_id_column.iloc[0], bytes): return cell_id_column.apply(lambda x: x.decode("utf-8")) @@ -441,27 +420,30 @@ def _get_polygons( specs: dict[str, Any], n_jobs: int, idx: ArrayLike | None = None, - invalid_ids: ArrayLike | None = None, ) -> GeoDataFrame: - def _poly(arr: ArrayLike) -> Polygon: - return Polygon(arr[:-1]) - - if invalid_ids is None: - invalid_ids = [] - # seems to be faster than pd.read_parquet df = pq.read_table(path / file).to_pandas() df[XeniumKeys.CELL_ID] = _decode_cell_id_column(df[XeniumKeys.CELL_ID]) - # Filter based on valid cell IDs if idx is provided - if idx is not None: - idx = 
idx[~idx.isin(invalid_ids)] - if len(invalid_ids) > 0: - idx = idx.reset_index(drop=True) - df = df[df[XeniumKeys.CELL_ID].isin(idx)] - else: - # If no idx provided, just (potentially) filter out invalid IDs - df = df[~df[XeniumKeys.CELL_ID].isin(invalid_ids)] + # # filter out cell ids with too few vertices to form a valid polygon. + # invalid_ids = df.groupby(XeniumKeys.CELL_ID).filter(lambda x: len(x) < 3)[ + # XeniumKeys.CELL_ID].unique() + # invalid_ids = [] if len(invalid_ids) == 0 else invalid_ids + # + # if len(invalid_ids) > 0: + # logging.warning( + # f"Found {len(invalid_ids)} invalid polygons for {file}, removing the masks corresponding to the IDs: {invalid_ids}" + # ) + # + # # Filter based on valid cell IDs if idx is provided + # if idx is not None: + # idx = idx[~idx.isin(invalid_ids)] + # if len(invalid_ids) > 0: + # idx = idx.reset_index(drop=True) + # df = df[df[XeniumKeys.CELL_ID].isin(idx)] + # else: + # # If no idx provided, just (potentially) filter out invalid IDs + # df = df[~df[XeniumKeys.CELL_ID].isin(invalid_ids)] group_by = df.groupby(XeniumKeys.CELL_ID) index = pd.Series(group_by.indices.keys()) @@ -470,7 +452,7 @@ def _poly(arr: ArrayLike) -> Polygon: index = _decode_cell_id_column(index) out = Parallel(n_jobs=n_jobs)( - delayed(_poly)(i.to_numpy()) + delayed(Polygon)(i.to_numpy()) for _, i in group_by[[XeniumKeys.BOUNDARIES_VERTEX_X, XeniumKeys.BOUNDARIES_VERTEX_Y]] ) geo_df = GeoDataFrame({"geometry": out}) From 9768bfdb4e4d1756be77954e57c54691581ba9c3 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Thu, 8 Jan 2026 16:11:38 +0100 Subject: [PATCH 4/6] fix --- src/spatialdata_io/readers/xenium.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 4380d8bb..6b946ff8 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -382,11 +382,11 @@ def filter(self, record: logging.LogRecord) -> bool: 
del image_models_kwargs["c_coords"] logger.removeFilter(IgnoreSpecificMessage()) - # if table is not None: + if table is not None: + tables["table"] = table # valid_nucleus_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_nuc_ids) # valid_cell_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_cell_ids) # tables["table"] = table[valid_nucleus_mask & valid_cell_mask].copy() - tables["table"] = table elements_dict = { "images": images, @@ -423,8 +423,8 @@ def _get_polygons( ) -> GeoDataFrame: # seems to be faster than pd.read_parquet df = pq.read_table(path / file).to_pandas() - df[XeniumKeys.CELL_ID] = _decode_cell_id_column(df[XeniumKeys.CELL_ID]) + # df[XeniumKeys.CELL_ID] = _decode_cell_id_column(df[XeniumKeys.CELL_ID]) # # filter out cell ids with too few vertices to form a valid polygon. # invalid_ids = df.groupby(XeniumKeys.CELL_ID).filter(lambda x: len(x) < 3)[ # XeniumKeys.CELL_ID].unique() From 57a1cb20f87ea12022ff609e39b52c927350ec12 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 9 Jan 2026 11:52:41 +0100 Subject: [PATCH 5/6] shapes in xenium remove constraint on representation --- src/spatialdata_io/readers/xenium.py | 38 ++++++++-------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 6b946ff8..f14e21af 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -4,9 +4,7 @@ import logging import os import re -import tempfile import warnings -import zipfile from pathlib import Path from types import MappingProxyType from typing import TYPE_CHECKING, Any @@ -245,7 +243,10 @@ def xenium( labels_models_kwargs=labels_models_kwargs, ) if cell_labels_indices_mapping is not None and table is not None: - if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]): + if not pd.DataFrame.equals( + cell_labels_indices_mapping["cell_id"], 
table.obs[str(XeniumKeys.CELL_ID)], + ): warnings.warn( "The cell_id column in the cell_labels_table does not match the cell_id column derived from the " "cell labels data. This could be due to trying to read a new version that is not supported yet. " @@ -384,9 +385,6 @@ def filter(self, record: logging.LogRecord) -> bool: if table is not None: tables["table"] = table - # valid_nucleus_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_nuc_ids) - # valid_cell_mask = ~table.obs[XeniumKeys.CELL_ID].isin(invalid_cell_ids) - # tables["table"] = table[valid_nucleus_mask & valid_cell_mask].copy() elements_dict = { "images": images, @@ -424,27 +422,6 @@ def _get_polygons( # seems to be faster than pd.read_parquet df = pq.read_table(path / file).to_pandas() - # df[XeniumKeys.CELL_ID] = _decode_cell_id_column(df[XeniumKeys.CELL_ID]) - # # filter out cell ids with too few vertices to form a valid polygon. - # invalid_ids = df.groupby(XeniumKeys.CELL_ID).filter(lambda x: len(x) < 3)[ - # XeniumKeys.CELL_ID].unique() - # invalid_ids = [] if len(invalid_ids) == 0 else invalid_ids - # - # if len(invalid_ids) > 0: - # logging.warning( - # f"Found {len(invalid_ids)} invalid polygons for {file}, removing the masks corresponding to the IDs: {invalid_ids}" - # ) - # - # # Filter based on valid cell IDs if idx is provided - # if idx is not None: - # idx = idx[~idx.isin(invalid_ids)] - # if len(invalid_ids) > 0: - # idx = idx.reset_index(drop=True) - # df = df[df[XeniumKeys.CELL_ID].isin(idx)] - # else: - # # If no idx provided, just (potentially) filter out invalid IDs - # df = df[~df[XeniumKeys.CELL_ID].isin(invalid_ids)] - group_by = df.groupby(XeniumKeys.CELL_ID) index = pd.Series(group_by.indices.keys()) # convert the index to str since we will compare it with an AnnData object, where the index is a str @@ -492,7 +469,12 @@ def _get_labels_and_indices_mapping( z = zarr.open(store, mode="r") # get the labels masks = da.from_array(z["masks"][f"{mask_index}"]) - labels = 
Labels2DModel.parse(masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs) + labels = Labels2DModel.parse( + masks, + dims=("y", "x"), + transformations={"global": Identity()}, + **labels_models_kwargs, + ) # build the matching table version = _parse_version_of_xenium_analyzer(specs) From 091c5f3806a0a6e9122beda2990789aba131911e Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 9 Jan 2026 11:54:41 +0100 Subject: [PATCH 6/6] fix formatting --- src/spatialdata_io/readers/xenium.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index f14e21af..5c2be682 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -243,10 +243,7 @@ def xenium( labels_models_kwargs=labels_models_kwargs, ) if cell_labels_indices_mapping is not None and table is not None: - if not pd.DataFrame.equals( - cell_labels_indices_mapping["cell_id"], - table.obs[str(XeniumKeys.CELL_ID)], - ): + if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]): warnings.warn( "The cell_id column in the cell_labels_table does not match the cell_id column derived from the " "cell labels data. This could be due to trying to read a new version that is not supported yet. " @@ -469,12 +466,7 @@ def _get_labels_and_indices_mapping( z = zarr.open(store, mode="r") # get the labels masks = da.from_array(z["masks"][f"{mask_index}"]) - labels = Labels2DModel.parse( - masks, - dims=("y", "x"), - transformations={"global": Identity()}, - **labels_models_kwargs, - ) + labels = Labels2DModel.parse(masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs) # build the matching table version = _parse_version_of_xenium_analyzer(specs)