From df6f416f5e2f57f77c92897bb186b5bcb41ad4d5 Mon Sep 17 00:00:00 2001 From: Goncalo Pinto Date: Sun, 26 Jan 2025 19:25:31 +0100 Subject: [PATCH 1/5] Changing a typo to try it out --- pertpy/preprocessing/_guide_rna.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pertpy/preprocessing/_guide_rna.py b/pertpy/preprocessing/_guide_rna.py index e85e9c29..6388891e 100644 --- a/pertpy/preprocessing/_guide_rna.py +++ b/pertpy/preprocessing/_guide_rna.py @@ -17,7 +17,7 @@ class GuideAssignment: - """Offers simple guide assigment based on count thresholds.""" + """Offers Simple guide assigment based on count thresholds.""" def assign_by_threshold( self, From a519397324a38656123d8d7fbf515f49c8c7ad6a Mon Sep 17 00:00:00 2001 From: Goncalo Pinto Date: Mon, 10 Feb 2025 19:03:23 +0100 Subject: [PATCH 2/5] dialogue embeddings added --- pertpy/tools/_dialogue.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/pertpy/tools/_dialogue.py b/pertpy/tools/_dialogue.py index 3194ecae..638dc430 100644 --- a/pertpy/tools/_dialogue.py +++ b/pertpy/tools/_dialogue.py @@ -82,27 +82,24 @@ def _get_pseudobulks( return pseudobulk - def _pseudobulk_pca(self, adata: AnnData, groupby: str, n_components: int = 50) -> pd.DataFrame: - """Return cell-averaged PCA components. - - TODO: consider merging with `get_pseudobulks` - TODO: DIALOGUE recommends running PCA on each cell type separately before running PMD - this should be implemented as an option here. + def _pseudobulk_feature( + self, adata: AnnData, groupby: str, n_components: int = 50, feature_key: str = "X_pca" + ) -> pd.DataFrame: + """Return Cell-averaged components from a custom feature space. Args: - groupby: The key to groupby for pseudobulks - n_components: The number of PCA components + groupby: The key to groupby for pseudobulks. + n_components: The number of components to use. + feature_key: The key in adata.obsm for the feature space (e.g., "X_pca", "X_umap"). Returns: - A pseudobulk of PCA components. + A pseudobulk DataFrame of the averaged components. """ aggr = {} - for category in adata.obs.loc[:, groupby].cat.categories: temp = adata.obs.loc[:, groupby] == category - aggr[category] = adata[temp].obsm["X_pca"][:, :n_components].mean(axis=0) - + aggr[category] = adata[temp].obsm[feature_key][:, :n_components].mean(axis=0) aggr = pd.DataFrame(aggr) - return aggr def _scale_data(self, pseudobulks: pd.DataFrame, normalize: bool = True) -> np.ndarray: @@ -558,7 +555,7 @@ def _load( self, adata: AnnData, ct_order: list[str], - agg_pca: bool = True, + agg_feature: bool = True, normalize: bool = True, ) -> tuple[list, dict]: """Separates cell into AnnDatas by celltype_key and creates the multifactor PMD input. @@ -568,14 +565,14 @@ def _load( Args: adata: AnnData object generate celltype objects for ct_order: The order of cell types - agg_pca: Whether to aggregate pseudobulks with PCA or not. + agg_feature: Whether to aggregate pseudobulks with some embeddings or not. normalize: Whether to mimic DIALOGUE behavior or not. Returns: A celltype_label:array dictionary. """ ct_subs = {ct: adata[adata.obs[self.celltype_key] == ct].copy() for ct in ct_order} - fn = self._pseudobulk_pca if agg_pca else self._get_pseudobulks + fn = self._pseudobulk_feature if agg_feature else self._get_pseudobulks ct_aggr = {ct: fn(ad, self.sample_id) for ct, ad in ct_subs.items()} # type: ignore # TODO: implement check (as in https://github.com/livnatje/DIALOGUE/blob/55da9be0a9bf2fcd360d9e11f63e30d041ec4318/R/DIALOGUE.main.R#L114-L119) @@ -593,7 +590,7 @@ def calculate_multifactor_PMD( adata: AnnData, penalties: list[int] = None, ct_order: list[str] = None, - agg_pca: bool = True, + agg_feature: bool = True, solver: Literal["lp", "bs"] = "bs", normalize: bool = True, ) -> tuple[AnnData, dict[str, np.ndarray], dict[Any, Any], dict[Any, Any]]: @@ -606,7 +603,7 @@ def calculate_multifactor_PMD( sample_id: Key to use for pseudobulk determination. penalties: PMD penalties. ct_order: The order of cell types. - agg_pca: Whether to calculate cell-averaged PCA components. + agg_features: Whether to calculate cell-averaged principal components. solver: Which solver to use for PMD. Must be one of "lp" (linear programming) or "bs" (binary search). For differences between these to please refer to https://github.com/theislab/sparsecca/blob/main/examples/linear_programming_multicca.ipynb normalize: Whether to mimic DIALOGUE as close as possible @@ -631,7 +628,7 @@ def calculate_multifactor_PMD( else: ct_order = cell_types = adata.obs[self.celltype_key].astype("category").cat.categories - mcca_in, ct_subs = self._load(adata, ct_order=cell_types, agg_pca=agg_pca, normalize=normalize) + mcca_in, ct_subs = self._load(adata, ct_order=cell_types, agg_feature=agg_feature, normalize=normalize) n_samples = mcca_in[0].shape[1] if penalties is None: From 1a2edceb57d99ff7c1d4b77db5608972d2d76266 Mon Sep 17 00:00:00 2001 From: Goncalo Pinto Date: Mon, 10 Feb 2025 19:06:07 +0100 Subject: [PATCH 3/5] adding the TODO --- pertpy/tools/_dialogue.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pertpy/tools/_dialogue.py b/pertpy/tools/_dialogue.py index 638dc430..97498a13 100644 --- a/pertpy/tools/_dialogue.py +++ b/pertpy/tools/_dialogue.py @@ -87,6 +87,9 @@ def _pseudobulk_feature( ) -> pd.DataFrame: """Return Cell-averaged components from a custom feature space. + TODO: consider merging with `get_pseudobulks` + TODO: DIALOGUE recommends running PCA on each cell type separately before running PMD - this should be implemented as an option here. + Args: groupby: The key to groupby for pseudobulks. n_components: The number of components to use. From 0e52356e04d011eb5e96dd852bc630a82f67b638 Mon Sep 17 00:00:00 2001 From: Goncalo Pinto Date: Fri, 21 Feb 2025 18:22:45 +0100 Subject: [PATCH 4/5] Fixed bugs in DIALOGUE for _load, _scale_data and pseudobulk --- .RData | Bin 0 -> 2595 bytes .Rhistory | 2 + pertpy/tools/_dialogue.py | 132 +++++++++++++++++++++++++++----------- 3 files changed, 98 insertions(+), 36 deletions(-) create mode 100644 .RData create mode 100644 .Rhistory diff --git a/.RData b/.RData new file mode 100644 index 0000000000000000000000000000000000000000..d019649332dde5b8be6c4635956b2c7e6e2ba379 GIT binary patch literal 2595 zcmV+;3f%P{iwFP!000000|5*Q^Yv0hSThP(3IG5A0{{dB0ssRA000001yxi=EjR!G z1Ofm60096500{s901PftVQyq^Z7y?VWn=&V01W^D0&)NVD5C%X0&r2*@oHqtv*W`g zlgP7^e<{hj=+^O7QD5iM{1jHy-~~6_r2>@`Je!vmXCQ}7G@*GQQSS8kv6p?p;OuRg z82zH!+nQnmtW!%jHF!Q`=^6jM?gm~n=+FBy@Wigj!1d{a^)jbW;|uy&UeHuze32_$ z9Ug;Q*9WnEi<}KRx9wRLQOe5Dz{A70Lnup8&k08lKQ`>4Tt5pX5p>`M-Q|wlQDW!u zC3_%!LV%+#z=6!Sv`+VtSa@iXZ^yx75cF!@G-+-B;OtFIAs zUL-*qHr2amxsn~*1M3Tb5-`ZC0C8Yu7k+qfHRoD>P z1jjQL?py)Wi><&bqKdq{DudfuvFPT+KFI+ed-nTx!T&YCOf(HbsimrXf1R+8o`)-_@?H54)25k5$pnZhJ-5NW_ zU(-;Fa`*>%sF~7~$An z5ghinA8q`pvr45&R6}V(;HZ$EZiz_*c(yBR=f5ZHM)W?zHDzIJ;^J(Znuz49Rb>{9$7cZ-@^Q3IHZdJeq zs}i|O=W8SLzp(0@Z}C9o;Qobx5odYVDtxH%QBB*d1FEQ)kt0&B00;|SZfR7azZS_I zBA=C1U+L=ecBtc*^?aC zvADi>s98a`R7OKw3J<46(F9M4lr=%;UvlM=I7@py;}EzeqtOu)%U*elphpZj^W3kd zb1jchgbH3bJ_C$0&uFNiUP+67HFi=x&40{n0kkkjfhEiKOQIpO%Dd?Q*1&0ox4{qg z@{UEHakon=T8zKko*i}17CP}SzH?4mSs1PNkU4S6Vy4KsZ10ovUSu>8sue-qL()ES z`76=mws2Yu$FYhK<-SnxpC_m5Z9Lgdg$;$TG-@YcTI0v5PRPn!J}LPw+t$a0=wEwg z=M?fc4+YTQNCM$@!<$#6f*=cb)1eqtr0#7ND3_yUE9@sLC;hgQ-kOzB6H~)Fuy_G& zXc-#*p$07)KhLufo5bbDul2_NSTd?6Dhpb@InaMb+mT>(FdkuN4hKm?mYlF*hV8iE z6v}}}=ffiDeke|g4hi*4vNi>+b3e`gd~{upjOD`sbYh{XEPE}nU(^2!&RNhc%JICrr`CL4xTCI8*Wr}D! zQG?7T?C4?}eaUk#?)IB{*#DPfw=@Dw?Ki7p zzzhN4Wt~fRSGv`|vGbUH)|gj9kGauOd?rhQSZ`o}bL2pyVU=)RwNijyT&wc$ruT$H-LriI_7 z8G@4o2OH<)51>{Hqah)?PHxm^WEZ;kQEZr+uYJBm;2ON5#?u)@?f9xdCz-zcdWJ6_ z>rY0}aYV_yCv`y~iLzyjO^dMzCs0(@ixFS$Ii4g>DThJ_92{6W?roE~c1kSIdqd`{ z8>oX6t%$WLn{=ZXOvGf{hz4LFZ0Q^*n0f+2OS5>PwOEA_j#g0vlXt%LNC5)WYEggG; znK~ktrE~s#R2XgSk2#p_m8KkkL+_dxiDWJW85Jj^Cekf%z$@K15pV))rm+rOc)qIa zRwp&b_&hN9)D5l^cWT-Z0pos}3CKO5{V9X)&eoNZe_ysH(iErFJ_UE}>jDXON}J#w zogky0a-oD&_wGQCpO>AqcI@hTHvJ*k;hO)udQ-@%D|mnL=NY7nR0b;Cmd_m=M#KgH zp7nKxyE3nraSOf<6wv)8I+4{{LmsitMF$|Yp_~juRqc~C*UEL>4#TR4xhO0?O$p@I zB{oEria!nm=X6t5JLO9-mtsWwGkY>Xc!2yq{(&#wT22SQMt3nbcub`RFx{t3S^7YH zzcaYave+S|F8PC^LT2c+S+Cltv?N|)6xF0|f0D?Ek?R()$T02dzHu)XJ$@lMqvtGP z`uu+|$ITo9Xe#w#i=+P67>E}IjOj#ztClM81s|7{tkkJ?DaGA`bHak=j9U~M1K@7w zdSMFm?5-lzT~`VtDXb9k8ol@U0$MB*bQ7+=>Ntx2X4#+0V8)!kjE~F}(S%x{dbKlh zbMkh4BFjorN1YB%6DXqGZD2naYc!SYkC+M^8yvg#6VfMXQ z@Bbn+A~Y+o2c`4=fFjRuRXZkwKS02b=0RvJuMTa7 pd.DataFrame: """Return cell-averaged data by groupby. @@ -82,46 +82,74 @@ def _get_pseudobulks( return pseudobulk - def _pseudobulk_feature( - self, adata: AnnData, groupby: str, n_components: int = 50, feature_key: str = "X_pca" + def _pseudobulk_feature_space( + self, + adata: AnnData, + groupby: str, + n_components: int = 50, + feature_space_key: str = "X_pca", + agg_func=np.median, # default to np.median; user can supply np.mean, etc. ) -> pd.DataFrame: - """Return Cell-averaged components from a custom feature space. - - TODO: consider merging with `get_pseudobulks` - TODO: DIALOGUE recommends running PCA on each cell type separately before running PMD - this should be implemented as an option here. + """Return cell-averaged components from a passed feature space using a custom aggregation function. Args: - groupby: The key to groupby for pseudobulks. - n_components: The number of components to use. - feature_key: The key in adata.obsm for the feature space (e.g., "X_pca", "X_umap"). + groupby: The key to group by for pseudobulks (e.g., a sample identifier). + n_components: The number of components (features) to use. + feature_space_key: The key in adata.obsm for the feature space (e.g., "X_pca", "X_umap"). + agg_func: The aggregation function to use (e.g., np.median, np.mean). It should accept an `axis` argument. Returns: - A pseudobulk DataFrame of the averaged components. + A pseudobulk DataFrame of the aggregated components, with samples as rows and components as columns. """ aggr = {} + # Loop over each category in the specified groupby column (assumed to be categorical) for category in adata.obs.loc[:, groupby].cat.categories: temp = adata.obs.loc[:, groupby] == category - aggr[category] = adata[temp].obsm[feature_key][:, :n_components].mean(axis=0) - aggr = pd.DataFrame(aggr) - return aggr - - def _scale_data(self, pseudobulks: pd.DataFrame, normalize: bool = True) -> np.ndarray: - """Row-wise mean center and scale by the standard deviation. + # Apply the user-provided aggregation function along axis 0 (averaging across cells) + aggr[category] = agg_func(adata[temp].obsm[feature_space_key][:, :n_components], axis=0) + # Create a DataFrame; keys become columns + aggr_df = pd.DataFrame(aggr) + # Transpose so that rows correspond to samples and columns to features + return aggr_df.T + + def _scale_data(self, pseudobulks: pd.DataFrame, normalize: bool = True, cap: float = 0.01) -> np.ndarray: + """Row-wise mean center and scale by the standard deviation, + and then cap extreme values based on quantiles. + + This mimics the following R function (excluding row subsetting): + + f <- function(X1){ + if(param$center.flag){ + X1 <- center.matrix(X1, dim = 2, sd.flag = TRUE) + X1 <- cap.mat(X1, cap = 0.01, MARGIN = 2) + } + X1 <- X1[samplesU, ] + return(X1) + } Args: - pseudobulks: The pseudobulk PCA components. - normalize: Whether to mimic DIALOGUE behavior or not. + pseudobulks: The pseudobulk PCA components as a DataFrame (samples as rows, features as columns). + normalize: Whether to perform centering, scaling, and capping. + cap: The quantile threshold for capping. For example, cap=0.01 means that for each column, values + above the 99th percentile are set to the 99th percentile, and values below the 1st percentile are set to the 1st percentile. Returns: - The scaled count matrix. + The processed (scaled and capped) matrix as a NumPy array. """ - # TODO: the `scale` function we implemented to match the R `scale` fn should already contain this functionality. - # DIALOGUE doesn't scale the data before passing to multicca, unlike what is recommended by sparsecca. - # However, performing this scaling _does_ increase overall correlation of the end result if normalize: - return pseudobulks.to_numpy() + # Center and scale (column-wise: subtract the column mean and divide by the column std) + scaled = (pseudobulks - pseudobulks.mean()) / pseudobulks.std() + + # Apply quantile-based capping column-wise. + capped = scaled.copy() + for col in scaled.columns: + lower = scaled[col].quantile(cap) # lower quantile (e.g., 1st percentile) + upper = scaled[col].quantile(1 - cap) # upper quantile (e.g., 99th percentile) + capped[col] = scaled[col].clip(lower=lower, upper=upper) + + return capped.to_numpy() else: - return ((pseudobulks - pseudobulks.mean()) / pseudobulks.std()).to_numpy() + return pseudobulks.to_numpy() def _concat_adata_mcp_scores( self, adata: AnnData, ct_subs: dict[str, AnnData], mcp_scores: dict[str, np.ndarray], celltype_key: str @@ -560,31 +588,61 @@ def _load( ct_order: list[str], agg_feature: bool = True, normalize: bool = True, + subset_common: bool = True, # new optional parameter ) -> tuple[list, dict]: - """Separates cell into AnnDatas by celltype_key and creates the multifactor PMD input. + """Separates cells into AnnDatas by celltype_key and creates the multifactor PMD input. Mimics DIALOGUE's `make.cell.types` and the pre-processing that occurs in DIALOGUE1. Args: - adata: AnnData object generate celltype objects for - ct_order: The order of cell types + adata: AnnData object to generate celltype objects for. + ct_order: The order of cell types. agg_feature: Whether to aggregate pseudobulks with some embeddings or not. normalize: Whether to mimic DIALOGUE behavior or not. + subset_common: If True, restrict output to common samples across cell types. Returns: - A celltype_label:array dictionary. + A tuple with: + - mcca_in: A list of pseudobulk matrices (one per cell type), with rows corresponding to sample IDs (if subset_common is True). + - ct_subs: A dictionary mapping each cell type to its corresponding AnnData subset (restricted to common samples if subset_common is True). """ + # 1. Split the AnnData into cell-type–specific subsets. ct_subs = {ct: adata[adata.obs[self.celltype_key] == ct].copy() for ct in ct_order} - fn = self._pseudobulk_feature if agg_feature else self._get_pseudobulks + + # 2. Choose the aggregation function based on the flag. + fn = self._pseudobulk_feature_space if agg_feature else self._get_pseudobulks + + # 3. Compute pseudobulk features for each cell type. + # Here, fn should return a DataFrame with sample IDs as the row index. ct_aggr = {ct: fn(ad, self.sample_id) for ct, ad in ct_subs.items()} # type: ignore - # TODO: implement check (as in https://github.com/livnatje/DIALOGUE/blob/55da9be0a9bf2fcd360d9e11f63e30d041ec4318/R/DIALOGUE.main.R#L114-L119) - # that there are at least 5 share samples here + # 4. Apply scaling/normalization to the aggregated data. + # We wrap the output back in a DataFrame to preserve the sample IDs. + ct_scaled = { + ct: pd.DataFrame(self._scale_data(df, normalize=normalize), index=df.index, columns=df.columns) + for ct, df in ct_aggr.items() + } - # TODO: https://github.com/livnatje/DIALOGUE/blob/55da9be0a9bf2fcd360d9e11f63e30d041ec4318/R/DIALOGUE.main.R#L121-L131 - ct_preprocess = {ct: self._scale_data(ad, normalize=normalize).T for ct, ad in ct_aggr.items()} + if subset_common: + # 5. Determine the set of common samples across all cell types (using the scaled data). + common_samples = set(ct_scaled[ct_order[0]].index) + for ct in ct_order[1:]: + common_samples = common_samples.intersection(set(ct_scaled[ct].index)) + common_samples_sorted = sorted(common_samples) - mcca_in = [ct_preprocess[ct] for ct in ct_order] + # Check if there are at least 5 common samples. + if len(common_samples_sorted) < 5: + raise ValueError("Cannot run DIALOGUE with less than 5 common samples across cell types.") + + # 6. Subset each scaled pseudobulk DataFrame to only the common samples. + ct_scaled = {ct: df.loc[common_samples_sorted] for ct, df in ct_scaled.items()} + + # 7. Also, restrict each cell-type AnnData to cells belonging to one of the common samples. + for ct in ct_subs: + ct_subs[ct] = ct_subs[ct][ct_subs[ct].obs[self.sample_id].isin(common_samples_sorted)].copy() + + # 8. Order the preprocessed pseudobulk matrices as a list in the order specified by ct_order. + mcca_in = [ct_scaled[ct] for ct in ct_order] return mcca_in, ct_subs @@ -633,6 +691,8 @@ def calculate_multifactor_PMD( mcca_in, ct_subs = self._load(adata, ct_order=cell_types, agg_feature=agg_feature, normalize=normalize) + mcca_in = [df.to_numpy() if hasattr(df, "to_numpy") else df for df in mcca_in] + n_samples = mcca_in[0].shape[1] if penalties is None: try: @@ -656,7 +716,7 @@ def calculate_multifactor_PMD( ws_dict = {ct: ws[i] for i, ct in enumerate(ct_order)} pre_r_scores = { - ct: ct_subs[ct].obsm["X_pca"][:, :50] @ ws[i] + ct: ct_subs[ct].obsm["X_pca"][:, :30] @ ws[i] for i, ct in enumerate(cell_types) # TODO change from 50 } From c27ffed0a6883a4d70cfbc1c9e0ba4059806a38b Mon Sep 17 00:00:00 2001 From: Lukas Heumos Date: Sat, 22 Feb 2025 12:03:31 +0100 Subject: [PATCH 5/5] Back to 50 --- pertpy/tools/_dialogue.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pertpy/tools/_dialogue.py b/pertpy/tools/_dialogue.py index 64f2a541..35debef6 100644 --- a/pertpy/tools/_dialogue.py +++ b/pertpy/tools/_dialogue.py @@ -716,7 +716,7 @@ def calculate_multifactor_PMD( ws_dict = {ct: ws[i] for i, ct in enumerate(ct_order)} pre_r_scores = { - ct: ct_subs[ct].obsm["X_pca"][:, :30] @ ws[i] + ct: ct_subs[ct].obsm["X_pca"][:, :50] @ ws[i] for i, ct in enumerate(cell_types) # TODO change from 50 }