From df6f416f5e2f57f77c92897bb186b5bcb41ad4d5 Mon Sep 17 00:00:00 2001
From: Goncalo Pinto <goncalo.pinto@helmholtz-munich.de>
Date: Sun, 26 Jan 2025 19:25:31 +0100
Subject: [PATCH 1/5] Changing a typo to try it out

---
 pertpy/preprocessing/_guide_rna.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pertpy/preprocessing/_guide_rna.py b/pertpy/preprocessing/_guide_rna.py
index e85e9c29..6388891e 100644
--- a/pertpy/preprocessing/_guide_rna.py
+++ b/pertpy/preprocessing/_guide_rna.py
@@ -17,7 +17,7 @@
 
 
 class GuideAssignment:
-    """Offers simple guide assigment based on count thresholds."""
+    """Offers Simple guide assigment based on count thresholds."""
 
     def assign_by_threshold(
         self,

From a519397324a38656123d8d7fbf515f49c8c7ad6a Mon Sep 17 00:00:00 2001
From: Goncalo Pinto <goncalo.pinto@helmholtz-munich.de>
Date: Mon, 10 Feb 2025 19:03:23 +0100
Subject: [PATCH 2/5] dialogue embeddings added

---
 pertpy/tools/_dialogue.py | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/pertpy/tools/_dialogue.py b/pertpy/tools/_dialogue.py
index 3194ecae..638dc430 100644
--- a/pertpy/tools/_dialogue.py
+++ b/pertpy/tools/_dialogue.py
@@ -82,27 +82,24 @@ def _get_pseudobulks(
 
         return pseudobulk
 
-    def _pseudobulk_pca(self, adata: AnnData, groupby: str, n_components: int = 50) -> pd.DataFrame:
-        """Return cell-averaged PCA components.
-
-        TODO: consider merging with `get_pseudobulks`
-        TODO: DIALOGUE recommends running PCA on each cell type separately before running PMD - this should be implemented as an option here.
+    def _pseudobulk_feature(
+        self, adata: AnnData, groupby: str, n_components: int = 50, feature_key: str = "X_pca"
+    ) -> pd.DataFrame:
+        """Return Cell-averaged components from a custom feature space.
 
         Args:
-            groupby: The key to groupby for pseudobulks
-            n_components: The number of PCA components
+            groupby: The key to groupby for pseudobulks.
+            n_components: The number of components to use.
+            feature_key: The key in adata.obsm for the feature space (e.g., "X_pca", "X_umap").
 
         Returns:
-            A pseudobulk of PCA components.
+            A pseudobulk DataFrame of the averaged components.
         """
         aggr = {}
-
         for category in adata.obs.loc[:, groupby].cat.categories:
             temp = adata.obs.loc[:, groupby] == category
-            aggr[category] = adata[temp].obsm["X_pca"][:, :n_components].mean(axis=0)
-
+            aggr[category] = adata[temp].obsm[feature_key][:, :n_components].mean(axis=0)
         aggr = pd.DataFrame(aggr)
-
         return aggr
 
     def _scale_data(self, pseudobulks: pd.DataFrame, normalize: bool = True) -> np.ndarray:
@@ -558,7 +555,7 @@ def _load(
         self,
         adata: AnnData,
         ct_order: list[str],
-        agg_pca: bool = True,
+        agg_feature: bool = True,
         normalize: bool = True,
     ) -> tuple[list, dict]:
         """Separates cell into AnnDatas by celltype_key and creates the multifactor PMD input.
@@ -568,14 +565,14 @@ def _load(
         Args:
             adata: AnnData object generate celltype objects for
             ct_order: The order of cell types
-            agg_pca: Whether to aggregate pseudobulks with PCA or not.
+            agg_feature: Whether to aggregate pseudobulks with some embeddings or not.
             normalize: Whether to mimic DIALOGUE behavior or not.
 
         Returns:
             A celltype_label:array dictionary.
         """
         ct_subs = {ct: adata[adata.obs[self.celltype_key] == ct].copy() for ct in ct_order}
-        fn = self._pseudobulk_pca if agg_pca else self._get_pseudobulks
+        fn = self._pseudobulk_feature if agg_feature else self._get_pseudobulks
         ct_aggr = {ct: fn(ad, self.sample_id) for ct, ad in ct_subs.items()}  # type: ignore
 
         # TODO: implement check (as in https://github.com/livnatje/DIALOGUE/blob/55da9be0a9bf2fcd360d9e11f63e30d041ec4318/R/DIALOGUE.main.R#L114-L119)
@@ -593,7 +590,7 @@ def calculate_multifactor_PMD(
         adata: AnnData,
         penalties: list[int] = None,
         ct_order: list[str] = None,
-        agg_pca: bool = True,
+        agg_feature: bool = True,
         solver: Literal["lp", "bs"] = "bs",
         normalize: bool = True,
     ) -> tuple[AnnData, dict[str, np.ndarray], dict[Any, Any], dict[Any, Any]]:
@@ -606,7 +603,7 @@ def calculate_multifactor_PMD(
             sample_id: Key to use for pseudobulk determination.
             penalties: PMD penalties.
             ct_order: The order of cell types.
-            agg_pca: Whether to calculate cell-averaged PCA components.
+            agg_features: Whether to calculate cell-averaged principal components.
             solver: Which solver to use for PMD. Must be one of "lp" (linear programming) or "bs" (binary search).
                     For differences between these to please refer to https://github.com/theislab/sparsecca/blob/main/examples/linear_programming_multicca.ipynb
             normalize: Whether to mimic DIALOGUE as close as possible
@@ -631,7 +628,7 @@ def calculate_multifactor_PMD(
         else:
             ct_order = cell_types = adata.obs[self.celltype_key].astype("category").cat.categories
 
-        mcca_in, ct_subs = self._load(adata, ct_order=cell_types, agg_pca=agg_pca, normalize=normalize)
+        mcca_in, ct_subs = self._load(adata, ct_order=cell_types, agg_feature=agg_feature, normalize=normalize)
 
         n_samples = mcca_in[0].shape[1]
         if penalties is None:

From 1a2edceb57d99ff7c1d4b77db5608972d2d76266 Mon Sep 17 00:00:00 2001
From: Goncalo Pinto <goncalo.pinto@helmholtz-munich.de>
Date: Mon, 10 Feb 2025 19:06:07 +0100
Subject: [PATCH 3/5] adding the TODO

---
 pertpy/tools/_dialogue.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pertpy/tools/_dialogue.py b/pertpy/tools/_dialogue.py
index 638dc430..97498a13 100644
--- a/pertpy/tools/_dialogue.py
+++ b/pertpy/tools/_dialogue.py
@@ -87,6 +87,9 @@ def _pseudobulk_feature(
     ) -> pd.DataFrame:
         """Return Cell-averaged components from a custom feature space.
 
+        TODO: consider merging with `get_pseudobulks`
+        TODO: DIALOGUE recommends running PCA on each cell type separately before running PMD - this should be implemented as an option here.
+
         Args:
             groupby: The key to groupby for pseudobulks.
             n_components: The number of components to use.

From 0e52356e04d011eb5e96dd852bc630a82f67b638 Mon Sep 17 00:00:00 2001
From: Goncalo Pinto <goncalo.pinto@helmholtz-munich.de>
Date: Fri, 21 Feb 2025 18:22:45 +0100
Subject: [PATCH 4/5] Fixed bugs in DIALOGUE for _load, _scale_data and
 pseudobulk

---
 .RData                    | Bin 0 -> 2595 bytes
 .Rhistory                 |   2 +
 pertpy/tools/_dialogue.py | 132 +++++++++++++++++++++++++++-----------
 3 files changed, 98 insertions(+), 36 deletions(-)
 create mode 100644 .RData
 create mode 100644 .Rhistory

diff --git a/.RData b/.RData
new file mode 100644
index 0000000000000000000000000000000000000000..d019649332dde5b8be6c4635956b2c7e6e2ba379
GIT binary patch
literal 2595
zcmV+;3f%P{iwFP!000000|5*Q^Yv0hSThP(3IG5A0{{dB0ssRA000001yxi=EjR!G
z1Ofm60096500{s901PftVQyq^Z7y?VWn=&V01W^D0&)NVD5C%X0&r2*@oHqtv*W`g
zlgP7^e<{hj=+^O7QD5iM{1jHy-~~6_r2>@`Je!vmXCQ}7G@*GQQSS8kv6p?p;OuRg
z82zH!+nQnmtW!%jHF!Q`=^6jM?gm~n=+FBy@Wigj!1d{a^)jbW;|uy&UeHuze32_$
z9Ug;Q*9WnEi<}KRx9wRLQOe5Dz{A70Lnup8&k08lKQ`>4Tt5pX5p>`M-Q|wlQDW!u
zC3_%!LV%+#z=6!Sv`+VtSa<nY;!F|u65Ro;Q~GN>@iXZ^yx75cF!@G-+-B;OtFIAs
zUL-*qHr2amxsn~*1M3Tb5-`ZC0C8Yu7k+qfHRoD><owP$Da}HE;wqsIJ)`d{qKIF*
z4(TbV#g?<g*B{EO*3{fr%f+*7Ey7u~ep`$i&ERR6JYj}fMy_&?Z&!5MldM29{Jjb@
z{94P)rV}rf^f;`uXxY%#`o<xs-j8M$dxT%goVA<jt@35(l*{qUb&zUxbWy8ELEZ>P
z1jjQL?py)Wi><&bqKdq{DudfuvFPT+KFI+ed-nTx!T&YCOf(Hbsimr<IwCfhM?1R8
z2|(Y0s6jZO2@Zj(_BVQ-^$a-2)}3^T)Ve0-%JZrb+nDqVkhzpqNhW3}fp1=MNaU(c
z@|9K|(o$Kjm8(P&i17To*fGSHoIbpjL#NxFr@V%L-Hi}2lIjl<hhvHvaY#~VUa6@I
zVpddg9N&kxag=aAcZG$Vc7kTnfE#&V^q>Xf1R+8o`)-_@?H54)25k5$pnZhJ-5NW_
zU(-;Fa`*>%sF~7~$A<RHluwJ?#zg!eICa4wCbDm628)F)Oi(^{ED<p8CZ2m0@`s>n
z5ghinA8q`pvr45&R6}V(;HZ$EZiz_*c(yBR=f5ZHM)W?zHDzIJ;^J<ZDEol5kO0}V
zPyx+#9nO~;Q}?P*$ILd=T3rhO6?)~OWV43ax>(Znuz49Rb>{9$7cZ-@^Q3IHZdJeq
zs}i|O=W8SLzp(0@Z}C9o;Qobx5odYVDtxH%QBB*d1FEQ)kt0&B00;|SZfR7azZS_I
zBA=<X%vA13<3(pJe?o32LECkr+h9}ei{a5Ua4w+UD%V8pIV@>C1U+L=ecBtc*^?aC
zvADi>s98a`R7OKw3J<46(F9M4lr=%;UvlM=I7@py;}EzeqtOu)%U*elphpZj^W3kd
zb1jchgbH3bJ_C$0&uFNiUP+67HFi=x&40{n0kkkjfhEiKOQIpO%Dd?Q*1&0ox4{qg
z@{UEHakon=T8zKko*i}17CP}SzH?4mSs1PNkU4S6Vy4KsZ10ovUSu>8sue-qL()ES
z`76=mws2Yu$FYhK<-SnxpC_m5Z9Lgdg$;$TG-@YcTI0v5PRPn!J}LPw+t$a0=wEwg
z=M?fc4+YTQNCM$@!<$#6f*=cb)1eqtr0#7ND3_yUE9@sLC;hgQ-kOzB6H~)Fuy_G&
zXc-#*p$07)KhLufo5bbDul2_NSTd?6Dhpb@InaMb+mT>(FdkuN4hKm?mYlF*hV8iE
z6v}}}=ffiDeke|g4hi*4vNi>+b3e`gd~{upjOD`sbYh{XEPE<w-GC;z-+@HshfZ4h
zPj}z(txQLtq1`36*7{RU*fWn%x7d`?FZufH4`%x|g0FG-i6m?wrPbtRoRX^J%Ih?u
zZ7|Zz-*FV!s(xIi?B|%8Z~S{#c+IZ+B`O6e<fA9cU5Jk%&FLd=xR%9iZXXAV!PM8t
z`o;Tk(8BV71zWZx=-{M5+hM#xnXaz`=2t%`-mHbS3%yTI|5}n^4HGhSZ#cjj=h+)7
zt;U(gHIHflCWH!?i?z8Z5b|E*+skTeP>}nU(^2!&RNhc%JICrr`CL4xTCI8*Wr}D!
zQG?7T?C4?}eaUk#?)IB{*#DPfw=@D<Sf$@fT_ONBPCGlWq(Es3$3bxb+zw>w?Ki7p
zzzhN4Wt~fRSGv`|vGbUH)|gj9kGauOd?rhQSZ`o}bL2pyVU=)RwNijyT&wc$<M5oE
zw=os{FFvMX`lmjmGQ6_PERDEN)#`h-E@N@jEJzt|9jPECRaW?rv)>ruT$H-LriI_7
z8G@4o2OH<)51>{Hqah)?PHxm^WEZ;kQEZr+uYJBm;2ON5#?u)@?f9xdCz-zcdWJ6_
z>rY0}aYV_yCv`y~iLzyjO^dMzCs0(@ixFS$Ii4g>DThJ_92{6W?roE~c1kSIdqd`{
z8>oX6t%<Sc>$WLn{=ZXOvGf{h<Yjr61>z4LFZ<J%D*#A5Zvj~d@XoO@O82lnlFZ^;
z>0Q^*n0f+2OS5>PwOEA<uX)kEsOCRe?k_q^9;BUF-BoUgjuHT(u4{9Q8nC}SPVs_1
zbN=h($Y)H*wR~f2piRPUMyiCuI3r|(y9nt7EouA@vKCgdxt}^Jw^Vl65=AySB0_Y(
z$lC*<_Fx#5D&cK`!!8Q|q1Snkge)_h6FpQifZBc5K$GTdEVwV7Cs{)=2S!roVGo2w
zd<5?FgEdF1!gAl-e@pdKz7TT80@3+}-d=j!*GFSO{@ljdLM?`aM+&ILWdo5m3TP2E
zok<=vsCI=e<bUj@Y_ws1FeR#^ZlYix(Yt8ealo7n*1>_j#g0vlXt%LNC5)WYEggG;
znK~ktrE~s#R2XgSk2#p_m8KkkL+_dxiDWJW85Jj^Cekf%z$@K15pV))rm+rOc)qIa
zRwp&b_&hN9)D5l^cWT-Z0pos}3CKO5{V9X)&eoNZe_ysH(iErFJ_UE}>jDXON}J#w
zogky0a-oD&_wGQCpO>AqcI@hTHvJ*k;hO)udQ-@%D|mnL=NY7nR0b;Cmd_m=M#KgH
zp7nKxyE3nraSOf<6wv)8I+4{{LmsitMF$|Yp_~juRqc~C*UEL>4#TR4xhO0?O$p@I
zB{oEria!nm=X6t5JLO9-mtsWwGkY>Xc!2yq{(&#wT22SQMt3nbcub`RFx{t3S^7YH
zzcaYave+S|F8PC^LT2c+S+Cltv?N|)6xF0|f0D?Ek?R()$T02dzHu)XJ$@lMqvtGP
z`uu+|$ITo9Xe#w#i=+P67>E}IjOj#ztClM81s|7{tkkJ?DaGA`bHak=j9U~M1K@7w
zdSMFm?5-lzT~`VtDXb9k8ol@U0$MB*bQ7+=>Ntx2X4#+0V8)!kjE~F}(S%x{dbKlh
zbMkh4BFj<tE09~XGEvAYXWr)Ka>orN1YB%6DXqGZD2naYc!SYkC+M^8yvg#6VfMXQ
z@Bbn+A~Y+o2c`4=fFjRuRXZkwKS02b=0RvJuMTa7<TrBCiwrKb^ql|z0R9NS=Wz@Q
F007i7@v8s;

literal 0
HcmV?d00001

diff --git a/.Rhistory b/.Rhistory
new file mode 100644
index 00000000..54ab75ab
--- /dev/null
+++ b/.Rhistory
@@ -0,0 +1,2 @@
+devtools::install_github(repo = "https://github.com/livnatje/DIALOGUE")
+q()
diff --git a/pertpy/tools/_dialogue.py b/pertpy/tools/_dialogue.py
index 97498a13..64f2a541 100644
--- a/pertpy/tools/_dialogue.py
+++ b/pertpy/tools/_dialogue.py
@@ -55,7 +55,7 @@ def __init__(self, sample_id: str, celltype_key: str, n_counts_key: str, n_mpcs:
         self.n_mcps = n_mpcs
 
     def _get_pseudobulks(
-        self, adata: AnnData, groupby: str, strategy: Literal["median", "mean"] = "median"
+        self, adata: AnnData, groupby: str, strategy: Literal["median", "mean"] = "mean"
     ) -> pd.DataFrame:
         """Return cell-averaged data by groupby.
 
@@ -82,46 +82,74 @@ def _get_pseudobulks(
 
         return pseudobulk
 
-    def _pseudobulk_feature(
-        self, adata: AnnData, groupby: str, n_components: int = 50, feature_key: str = "X_pca"
+    def _pseudobulk_feature_space(
+        self,
+        adata: AnnData,
+        groupby: str,
+        n_components: int = 50,
+        feature_space_key: str = "X_pca",
+        agg_func=np.median,  # default to np.median; user can supply np.mean, etc.
     ) -> pd.DataFrame:
-        """Return Cell-averaged components from a custom feature space.
-
-        TODO: consider merging with `get_pseudobulks`
-        TODO: DIALOGUE recommends running PCA on each cell type separately before running PMD - this should be implemented as an option here.
+        """Return cell-averaged components from a passed feature space using a custom aggregation function.
 
         Args:
-            groupby: The key to groupby for pseudobulks.
-            n_components: The number of components to use.
-            feature_key: The key in adata.obsm for the feature space (e.g., "X_pca", "X_umap").
+            groupby: The key to group by for pseudobulks (e.g., a sample identifier).
+            n_components: The number of components (features) to use.
+            feature_space_key: The key in adata.obsm for the feature space (e.g., "X_pca", "X_umap").
+            agg_func: The aggregation function to use (e.g., np.median, np.mean). It should accept an `axis` argument.
 
         Returns:
-            A pseudobulk DataFrame of the averaged components.
+            A pseudobulk DataFrame of the aggregated components, with samples as rows and components as columns.
         """
         aggr = {}
+        # Loop over each category in the specified groupby column (assumed to be categorical)
         for category in adata.obs.loc[:, groupby].cat.categories:
             temp = adata.obs.loc[:, groupby] == category
-            aggr[category] = adata[temp].obsm[feature_key][:, :n_components].mean(axis=0)
-        aggr = pd.DataFrame(aggr)
-        return aggr
-
-    def _scale_data(self, pseudobulks: pd.DataFrame, normalize: bool = True) -> np.ndarray:
-        """Row-wise mean center and scale by the standard deviation.
+            # Apply the user-provided aggregation function along axis 0 (averaging across cells)
+            aggr[category] = agg_func(adata[temp].obsm[feature_space_key][:, :n_components], axis=0)
+        # Create a DataFrame; keys become columns
+        aggr_df = pd.DataFrame(aggr)
+        # Transpose so that rows correspond to samples and columns to features
+        return aggr_df.T
+
+    def _scale_data(self, pseudobulks: pd.DataFrame, normalize: bool = True, cap: float = 0.01) -> np.ndarray:
+        """Row-wise mean center and scale by the standard deviation,
+        and then cap extreme values based on quantiles.
+
+        This mimics the following R function (excluding row subsetting):
+
+            f <- function(X1){
+            if(param$center.flag){
+                X1 <- center.matrix(X1, dim = 2, sd.flag = TRUE)
+                X1 <- cap.mat(X1, cap = 0.01, MARGIN = 2)
+            }
+            X1 <- X1[samplesU, ]
+            return(X1)
+            }
 
         Args:
-            pseudobulks: The pseudobulk PCA components.
-            normalize: Whether to mimic DIALOGUE behavior or not.
+            pseudobulks: The pseudobulk PCA components as a DataFrame (samples as rows, features as columns).
+            normalize: Whether to perform centering, scaling, and capping.
+            cap: The quantile threshold for capping. For example, cap=0.01 means that for each column, values
+                above the 99th percentile are set to the 99th percentile, and values below the 1st percentile are set to the 1st percentile.
 
         Returns:
-            The scaled count matrix.
+            The processed (scaled and capped) matrix as a NumPy array.
         """
-        # TODO: the `scale` function we implemented to match the R `scale` fn should already contain this functionality.
-        # DIALOGUE doesn't scale the data before passing to multicca, unlike what is recommended by sparsecca.
-        # However, performing this scaling _does_ increase overall correlation of the end result
         if normalize:
-            return pseudobulks.to_numpy()
+            # Center and scale (column-wise: subtract the column mean and divide by the column std)
+            scaled = (pseudobulks - pseudobulks.mean()) / pseudobulks.std()
+
+            # Apply quantile-based capping column-wise.
+            capped = scaled.copy()
+            for col in scaled.columns:
+                lower = scaled[col].quantile(cap)  # lower quantile (e.g., 1st percentile)
+                upper = scaled[col].quantile(1 - cap)  # upper quantile (e.g., 99th percentile)
+                capped[col] = scaled[col].clip(lower=lower, upper=upper)
+
+            return capped.to_numpy()
         else:
-            return ((pseudobulks - pseudobulks.mean()) / pseudobulks.std()).to_numpy()
+            return pseudobulks.to_numpy()
 
     def _concat_adata_mcp_scores(
         self, adata: AnnData, ct_subs: dict[str, AnnData], mcp_scores: dict[str, np.ndarray], celltype_key: str
@@ -560,31 +588,61 @@ def _load(
         ct_order: list[str],
         agg_feature: bool = True,
         normalize: bool = True,
+        subset_common: bool = True,  # new optional parameter
     ) -> tuple[list, dict]:
-        """Separates cell into AnnDatas by celltype_key and creates the multifactor PMD input.
+        """Separates cells into AnnDatas by celltype_key and creates the multifactor PMD input.
 
         Mimics DIALOGUE's `make.cell.types` and the pre-processing that occurs in DIALOGUE1.
 
         Args:
-            adata: AnnData object generate celltype objects for
-            ct_order: The order of cell types
+            adata: AnnData object to generate celltype objects for.
+            ct_order: The order of cell types.
             agg_feature: Whether to aggregate pseudobulks with some embeddings or not.
             normalize: Whether to mimic DIALOGUE behavior or not.
+            subset_common: If True, restrict output to common samples across cell types.
 
         Returns:
-            A celltype_label:array dictionary.
+            A tuple with:
+            - mcca_in: A list of pseudobulk matrices (one per cell type), with rows corresponding to sample IDs (if subset_common is True).
+            - ct_subs: A dictionary mapping each cell type to its corresponding AnnData subset (restricted to common samples if subset_common is True).
         """
+        # 1. Split the AnnData into cell-type–specific subsets.
         ct_subs = {ct: adata[adata.obs[self.celltype_key] == ct].copy() for ct in ct_order}
-        fn = self._pseudobulk_feature if agg_feature else self._get_pseudobulks
+
+        # 2. Choose the aggregation function based on the flag.
+        fn = self._pseudobulk_feature_space if agg_feature else self._get_pseudobulks
+
+        # 3. Compute pseudobulk features for each cell type.
+        #    Here, fn should return a DataFrame with sample IDs as the row index.
         ct_aggr = {ct: fn(ad, self.sample_id) for ct, ad in ct_subs.items()}  # type: ignore
 
-        # TODO: implement check (as in https://github.com/livnatje/DIALOGUE/blob/55da9be0a9bf2fcd360d9e11f63e30d041ec4318/R/DIALOGUE.main.R#L114-L119)
-        # that there are at least 5 share samples here
+        # 4. Apply scaling/normalization to the aggregated data.
+        #    We wrap the output back in a DataFrame to preserve the sample IDs.
+        ct_scaled = {
+            ct: pd.DataFrame(self._scale_data(df, normalize=normalize), index=df.index, columns=df.columns)
+            for ct, df in ct_aggr.items()
+        }
 
-        # TODO: https://github.com/livnatje/DIALOGUE/blob/55da9be0a9bf2fcd360d9e11f63e30d041ec4318/R/DIALOGUE.main.R#L121-L131
-        ct_preprocess = {ct: self._scale_data(ad, normalize=normalize).T for ct, ad in ct_aggr.items()}
+        if subset_common:
+            # 5. Determine the set of common samples across all cell types (using the scaled data).
+            common_samples = set(ct_scaled[ct_order[0]].index)
+            for ct in ct_order[1:]:
+                common_samples = common_samples.intersection(set(ct_scaled[ct].index))
+            common_samples_sorted = sorted(common_samples)
 
-        mcca_in = [ct_preprocess[ct] for ct in ct_order]
+            # Check if there are at least 5 common samples.
+            if len(common_samples_sorted) < 5:
+                raise ValueError("Cannot run DIALOGUE with less than 5 common samples across cell types.")
+
+            # 6. Subset each scaled pseudobulk DataFrame to only the common samples.
+            ct_scaled = {ct: df.loc[common_samples_sorted] for ct, df in ct_scaled.items()}
+
+            # 7. Also, restrict each cell-type AnnData to cells belonging to one of the common samples.
+            for ct in ct_subs:
+                ct_subs[ct] = ct_subs[ct][ct_subs[ct].obs[self.sample_id].isin(common_samples_sorted)].copy()
+
+        # 8. Order the preprocessed pseudobulk matrices as a list in the order specified by ct_order.
+        mcca_in = [ct_scaled[ct] for ct in ct_order]
 
         return mcca_in, ct_subs
 
@@ -633,6 +691,8 @@ def calculate_multifactor_PMD(
 
         mcca_in, ct_subs = self._load(adata, ct_order=cell_types, agg_feature=agg_feature, normalize=normalize)
 
+        mcca_in = [df.to_numpy() if hasattr(df, "to_numpy") else df for df in mcca_in]
+
         n_samples = mcca_in[0].shape[1]
         if penalties is None:
             try:
@@ -656,7 +716,7 @@ def calculate_multifactor_PMD(
         ws_dict = {ct: ws[i] for i, ct in enumerate(ct_order)}
 
         pre_r_scores = {
-            ct: ct_subs[ct].obsm["X_pca"][:, :50] @ ws[i]
+            ct: ct_subs[ct].obsm["X_pca"][:, :30] @ ws[i]
             for i, ct in enumerate(cell_types)  # TODO change from 50
         }
 

From c27ffed0a6883a4d70cfbc1c9e0ba4059806a38b Mon Sep 17 00:00:00 2001
From: Lukas Heumos <lukas.heumos@posteo.net>
Date: Sat, 22 Feb 2025 12:03:31 +0100
Subject: [PATCH 5/5] Back to 50

---
 pertpy/tools/_dialogue.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pertpy/tools/_dialogue.py b/pertpy/tools/_dialogue.py
index 64f2a541..35debef6 100644
--- a/pertpy/tools/_dialogue.py
+++ b/pertpy/tools/_dialogue.py
@@ -716,7 +716,7 @@ def calculate_multifactor_PMD(
         ws_dict = {ct: ws[i] for i, ct in enumerate(ct_order)}
 
         pre_r_scores = {
-            ct: ct_subs[ct].obsm["X_pca"][:, :30] @ ws[i]
+            ct: ct_subs[ct].obsm["X_pca"][:, :50] @ ws[i]
             for i, ct in enumerate(cell_types)  # TODO change from 50
         }