Skip to content

concat error if mdata.obsm contains multi-dimensional array and merge is not None #103

@racng

Description

@racng

Describe the bug
A clear and concise description of what the bug is.
Indexing error when I use merge='same' or merge='unique' when calling mudata.concat on MuData objects that have different embeddings stored in mdata.obsm slot. For example, if mdata1 has 'X_totavi' and mdata2 does not.

The error disappears if I delete the embeddings that are not shared before using concat.

for mdata in [mdata1, mdata2]:
    keys = list(mdata.obsm.keys())
    for x in keys:
        if x not in ['rna', 'prot', 'airr']:
            del mdata.obsm[x]

To Reproduce

# Here mdata1 and mdata2 both have 10,000 observations
mdata = mudata.concat([mdata1, mdata2], 
    join='outer', merge='same',
    label='source', keys=['pb', 'tumor'])

---> [12](vscode-notebook-cell:?execution_count=65&line=12) 
mdata = mudata.concat([mdata1, mdata2], 
     13     join='outer', merge='same',
     14     label='source', keys=['pb', 'tumor'])

File ~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/mudata/_core/merge.py:287, in concat(mdatas, join, merge, uns_merge, label, keys, index_unique, fill_value, pairwise)
    283 uns = uns_merge([m.uns for m in mdatas])
    285 # TODO: attrmap
--> [287](https://vscode-remote+ssh-002dremote-002bheath1-002esystemsbiology-002enet.vscode-resource.vscode-cdn.net/users/rng/proj/tlc/~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/mudata/_core/merge.py:287) return MuData(
    288     **{
    289         "data": modalities,
    290         "axis": axis,
    291         dim: concat_annot,
    292         alt_dim: alt_annot,
    293         f"{dim}m": concat_mapping,
    294         f"{alt_dim}m": alt_mapping,
    295         f"{dim}p": concat_pairwise,
    296         f"{alt_dim}p": alt_pairwise,
    297         "uns": uns,
    298     }
    299 )

File ~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/mudata/_core/mudata.py:232, in MuData.__init__(self, data, feature_types_names, as_view, index, **kwargs)
    229 self._axis = kwargs.get("axis") or 0
    231 # Restore proper .obs and .var
--> [232](https://vscode-remote+ssh-002dremote-002bheath1-002esystemsbiology-002enet.vscode-resource.vscode-cdn.net/users/rng/proj/tlc/~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/mudata/_core/mudata.py:232) self.update()
    234 self._uns = kwargs.get("uns") or {}
    236 return

File ~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/mudata/_core/mudata.py:1829, in MuData.update(self)
   1827 if len(self.mod) > 0:
   1828     self.update_var()
-> [1829](https://vscode-remote+ssh-002dremote-002bheath1-002esystemsbiology-002enet.vscode-resource.vscode-cdn.net/users/rng/proj/tlc/~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/mudata/_core/mudata.py:1829)     self.update_obs()

File ~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/mudata/_core/mudata.py:1461, in MuData.update_obs(self)
   1457 """
   1458 Update global .obs_names according to the .obs_names of all the modalities.
   1459 """
   1460 join_common = self.axis == 1
-> [1461](https://vscode-remote+ssh-002dremote-002bheath1-002esystemsbiology-002enet.vscode-resource.vscode-cdn.net/users/rng/proj/tlc/~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/mudata/_core/mudata.py:1461) self._update_attr("obs", axis=1, join_common=join_common)

File ~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/mudata/_core/mudata.py:602, in MuData._update_attr(self, attr, axis, **kwargs)
    600     if "join_common" in kwargs:
    601         join_common = kwargs.pop("join_common")
--> [602](https://vscode-remote+ssh-002dremote-002bheath1-002esystemsbiology-002enet.vscode-resource.vscode-cdn.net/users/rng/proj/tlc/~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/mudata/_core/mudata.py:602)     self._update_attr_legacy(attr, axis, join_common, **kwargs)
    603     return
    605 prev_index = getattr(self, attr).index

File ~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/mudata/_core/mudata.py:1309, in MuData._update_attr_legacy(self, attr, axis, join_common, **kwargs)
   1307 for mx_key, mx in attrm.items():
   1308     if mx_key not in self.mod.keys():  # not a modality name
-> [1309](https://vscode-remote+ssh-002dremote-002bheath1-002esystemsbiology-002enet.vscode-resource.vscode-cdn.net/users/rng/proj/tlc/~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/mudata/_core/mudata.py:1309)         attrm[mx_key] = attrm[mx_key][index_order]
   1310         attrm[mx_key][index_order == -1] = np.nan
   1312 # Update .obsp/.varp (size might have changed)

File ~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/pandas/core/frame.py:4113, in DataFrame.__getitem__(self, key)
   4111     if is_iterator(key):
   4112         key = list(key)
-> [4113](https://vscode-remote+ssh-002dremote-002bheath1-002esystemsbiology-002enet.vscode-resource.vscode-cdn.net/users/rng/proj/tlc/~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/pandas/core/frame.py:4113)     indexer = self.columns._get_indexer_strict(key, "columns")[1]
   4115 # take() does not accept boolean indexers
   4116 if getattr(indexer, "dtype", None) == bool:

File ~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/pandas/core/indexes/base.py:6212, in Index._get_indexer_strict(self, key, axis_name)
   6209 else:
   6210     keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> [6212](https://vscode-remote+ssh-002dremote-002bheath1-002esystemsbiology-002enet.vscode-resource.vscode-cdn.net/users/rng/proj/tlc/~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/pandas/core/indexes/base.py:6212) self._raise_if_missing(keyarr, indexer, axis_name)
   6214 keyarr = self.take(indexer)
   6215 if isinstance(key, Index):
   6216     # GH 42790 - Preserve name from an Index

File ~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/pandas/core/indexes/base.py:6261, in Index._raise_if_missing(self, key, indexer, axis_name)
   6259 if nmissing:
   6260     if nmissing == len(indexer):
-> [6261](https://vscode-remote+ssh-002dremote-002bheath1-002esystemsbiology-002enet.vscode-resource.vscode-cdn.net/users/rng/proj/tlc/~/mambaforge/envs/compbio-8-25/lib/python3.11/site-packages/pandas/core/indexes/base.py:6261)         raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   6263     not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
   6264     raise KeyError(f"{not_found} not in index")

KeyError: "None of [Index([    3,     4,     7,     8,    11,    12,    15,    17,    18,    20,\n       ...\n       19906, 19907, 19934, 19941, 19948, 19963, 19967, 19976, 19986, 19998],\n      dtype='int64', length=20000)] are in the [columns]"

Expected behaviour
I expected concat to fill missing .obsm embeddings values with null if they are not shared.

System

  • OS: linux
  • Python version Python 3.11.13
  • Versions of libraries involved: mudata v0.3.2

Additional context
None

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugSomething isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions