diff --git a/code/mnm_analysis/auto_finemapping_postprocessing.ipynb b/code/mnm_analysis/auto_finemapping_postprocessing.ipynb
new file mode 100644
index 000000000..44f28fc25
--- /dev/null
+++ b/code/mnm_analysis/auto_finemapping_postprocessing.ipynb
@@ -0,0 +1,1117 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "sos"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "    # Notes: --cs_corr_thres defaults to 0.5; --context-meta is still to be developed;\n",
+ "    # --region-name and --step1_only are optional.\n",
+ "    sos run /home/rl3328/xqtl-protocol/code/mnm_analysis/auto_finemapping_post_processing.ipynb gwas_results_export \\\n",
+ "    --region_file /mnt/vast/hpc/homes/rf2872/codes/xqtl-analysis/resource/hg38_1362_blocks.bed \\\n",
+ "    --file_path /home/rl3328/image_QTL/output/univariate_rss \\\n",
+ "    --name AI_IMAGE \\\n",
+ "    --suffix univariate.rds \\\n",
+ "    --prefix Automated_QC \\\n",
+ "    --min_corr 0.8 \\\n",
+ "    --geno_ref /mnt/vast/hpc/csg/rf2872/data/Fungen_xQTL/geno_align/Fungen_xQTL.ROSMAP_NIA_WGS.ROSMAP_NIA_WGS.MSBB_WGS_ADSP_hg38.MiGA.MAP_Brain-xQTL_Gwas_geno_0.STARNET.aligned.bim.gz \\\n",
+ "    --cwd /home/rl3328/image_QTL/auto_finemapping_export \\\n",
+ "    --gwas \\\n",
+ "    --cs_corr_thres 0.3 \\\n",
+ "    --context-meta /home/rl3328/GWAS/image_GWAS/context_meta.tsv \\\n",
+ "    --region-name chr3_162827771_166218666 \\\n",
+ "    --step1_only"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "sos"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "[global]\n",
+ "import glob\n",
+ "import pandas as pd\n",
+ "# A region list file documenting the chr_pos_ref_alt of a susie_object\n",
+ "parameter: cwd = path(\"output\")\n",
+ "parameter: name = \"demo\"\n",
+ "\n",
+ "## Path to the work directory where output is located\n",
+ "## Container that contains the necessary packages\n",
+ "parameter: container = \"\"\n",
+ "import re\n",
+ "parameter: entrypoint= \"\"\n",
+ "# For cluster jobs, number of commands to run per job\n",
+ "parameter: job_size = 50\n",
+ "# Wall clock time expected\n",
+ "parameter: walltime = \"96h\"\n",
+ "# Memory expected\n",
+ "parameter: mem = \"6G\"\n",
+ "# Number of threads\n",
+ "parameter: numThreads = 2\n",
+ "parameter: windows = 1000000\n",
+ "# use this function to edit memory string for PLINK input\n",
+ "from sos.utils import expand_size"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "sos"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Exporting cis susie_twas results\n",
+ "[cis_results_export_1, gwas_results_export_1]\n",
+ "# per chunk we process at most 200 datasets\n",
+ "parameter: per_chunk = 200\n",
+ "# Region list should have the last column being the region name.
\n", + "parameter:region_file=path()\n", + "# the path stored original output files\n", + "parameter:file_path=''\n", + "# assuming orinal files are named as prefix.id.suffix\n", + "# prefix of the original files (before id) to identify file.\n", + "parameter:prefix=[]\n", + "# suffix of the original files (after id) to identify file.\n", + "parameter:suffix=str\n", + "# if need to rename context, please load condition meta file\n", + "parameter: condition_meta = path()\n", + "# if the pip all variants in a cs < this threshold, then remove this cs\n", + "parameter: pip_thres = 0.05\n", + "# only keep top cs_size variants in one cs \n", + "parameter: cs_size = 3\n", + "# provide exported meta to filter the exported genes \n", + "parameter: exported_file = path()\n", + "# Optional: if a region list is provide the analysis will be focused on provided region. \n", + "# The LAST column of this list will contain the ID of regions to focus on\n", + "# Otherwise, all regions with both genotype and phenotype files will be analyzed\n", + "parameter: region_list = path()\n", + "# Optional: if a region name is provided \n", + "# the analysis would be focused on the union of provides region list and region names\n", + "parameter: region_name = []\n", + "# the (aligned) geno bim file to check allele flipping\n", + "parameter: geno_ref = path()\n", + "# context meta file to map renamed context backs\n", + "parameter: context_meta = path()\n", + "# default cretria in susie purity filtering, recommended to use 0.8 here \n", + "parameter: min_corr = []\n", + "# set this parameter as `True` when exporting gwas data.\n", + "parameter: gwas = False\n", + "# set this parameter as `True` when exporting fsusie data.\n", + "parameter: fsusie = False\n", + "# set this parameter as `True` when exporting metaQTL data.\n", + "parameter: metaQTL = False\n", + "# set this parameter as `True` when exporting mnm data.\n", + "parameter: mnm = False\n", + "# flag this CS if cs_corr between this CS and top_cs > cs_corr_thres\n", + "parameter: cs_corr_thres = 0.5\n", + "\n", + "import pandas as pd\n", + "import os\n", + "\n", + "region = pd.read_csv(region_file, sep='\\t', names=['chr', 'start', 'end', 'id'])\n", + "\n", + "region_ids = []\n", + "# If region_list is provided, read the file and extract IDs\n", + "if not region_list.is_dir():\n", + " if region_list.is_file():\n", + " region_list_df = pd.read_csv(region_list, sep='\\t', header=None, comment = \"#\")\n", + " region_ids = region_list_df.iloc[:, -1].unique() # Extracting the last column for IDs\n", + " else:\n", + " raise ValueError(\"The region_list path provided is not a file.\")\n", + " \n", + "if len(region_name) > 0:\n", + " region_ids = list(set(region_ids).union(set(region_name)))\n", + " \n", + "if len(region_ids) > 0:\n", + " region = region[region['id'].isin(region_ids)]\n", + "\n", + "# Function to create list of formatted strings for each row\n", + "def create_formatted_list(row):\n", + " if len(prefix) > 0:\n", + " formatted_list = [f\"{file_path}/{p}.{row['id']}.{suffix}\" for p in prefix]\n", + " else:\n", + " formatted_list = [f\"{file_path}/{row['id']}.{suffix}\"] # GWAS data do not have prefix \n", + " return formatted_list\n", + "\n", + "# Apply the function to each row\n", + "region['original_data'] = region.apply(create_formatted_list, axis=1)\n", + "\n", + "def group_by_region(lst, partition):\n", + " # from itertools import accumulate\n", + " # partition = [len(x) for x in partition]\n", + " # Compute the cumulative sums once\n", + " # 
cumsum_vector = list(accumulate(partition))\n", + " # Use slicing based on the cumulative sums\n", + " # return [lst[(cumsum_vector[i-1] if i > 0 else 0):cumsum_vector[i]] for i in range(len(partition))]\n", + " return partition\n", + "\n", + "def filter_existing_paths(row):\n", + " existing_paths = [path for path in row if os.path.exists(path)]\n", + " return existing_paths\n", + "\n", + "def is_file_exported(paths, results_set):\n", + " for path in paths:\n", + " basename = os.path.basename(path)\n", + " if basename not in results_set:\n", + " return False\n", + " return True\n", + "\n", + "region['original_data'] = region['original_data'].apply(filter_existing_paths)\n", + "region = region[region['original_data'].map(bool)]\n", + "\n", + "# if provided exported meta, check if the original data are all exported already, isfo skip them \n", + "if not exported_file.is_dir():\n", + " if exported_file.is_file():\n", + " results = pd.read_csv(exported_file, sep='\\t')\n", + " results_set = set(results['original_data'])\n", + " results_set = {item.strip() for sub in results_set for item in sub.split(',')}\n", + " mask = region['original_data'].apply(lambda paths: not is_file_exported(paths, results_set))\n", + " region = region[mask]\n", + " else:\n", + " raise ValueError(\"The exported_file path provided is not a file.\")\n", + "\n", + "# added because there is frequently changed ID from upstream analysis!!! \n", + "# If after filtering no region remains, update 'id' by concatenating 'chr' and original 'id' with '_'\n", + "if region.empty:\n", + " print(\"No regions remain after filtering. Reattempting using concatenated id (chr_id).\")\n", + " # Update the 'id' column using the concatenation of the first column (chr) and the original 'id'\n", + " region = pd.read_csv(region_file, sep='\\t', names=['chr', 'start', 'end', 'id'])\n", + " region['id'] = region['chr'] + \"_\" + region['id']\n", + " \n", + " # Reapply region_ids filter if provided\n", + " if len(region_ids) > 0:\n", + " region = region[region['id'].isin(region_ids)]\n", + " \n", + " # Recreate original_data with the updated id\n", + " region['original_data'] = region.apply(create_formatted_list, axis=1)\n", + " region['original_data'] = region['original_data'].apply(filter_existing_paths)\n", + " region = region[region['original_data'].map(bool)]\n", + " \n", + " # Reapply exported_file filtering if applicable\n", + " if not exported_file.is_dir():\n", + " if exported_file.is_file():\n", + " results = pd.read_csv(exported_file, sep='\\t')\n", + " results_set = set(results['original_data'])\n", + " results_set = {item.strip() for sub in results_set for item in sub.split(',')}\n", + " mask = region['original_data'].apply(lambda paths: not is_file_exported(paths, results_set))\n", + " region = region[mask]\n", + " else:\n", + " raise ValueError(\"The exported_file path provided is not a file.\")\n", + "regional_data = {\n", + " 'meta': [(row['chr'], row['start'], row['end'], row['id']) for _, row in region.iterrows()],\n", + " 'data': [(row['original_data']) for _, row in region.iterrows()]\n", + "}\n", + "\n", + "meta_info = regional_data['meta']\n", + "stop_if(len(regional_data['data']) == 0, f' All files have been exported already')\n", + "\n", + "input: regional_data[\"data\"], group_by = lambda x: group_by_region(x, regional_data[\"data\"]), group_with = \"meta_info\"\n", + "output: f\"{cwd}/{name}_cache/{name}_{_meta_info[3]}.tsv\"\n", + "task: trunk_workers = job_size, walltime = walltime, trunk_size = job_size, mem = mem, 
cores = numThreads, tags = f'{_output:bn}'\n",
+ "R: expand = \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container, entrypoint=entrypoint\n",
+ "  library(tidyverse)\n",
+ "  library(data.table)\n",
+ "  library(susieR)\n",
+ "  library(pecotmr)\n",
+ "  # check whether top_loci exists and has rows\n",
+ "  has_rows <- function(df) {\n",
+ "    !is.null(df) && nrow(df) > 0\n",
+ "  }\n",
+ "\n",
+ "  # function to check allele flipping by mapping variants to the aligned genotype bim file\n",
+ "  align_to_genoref <- function(var_list, geno_ref, region ){\n",
+ "    geno_ref <- pecotmr:::tabix_region(file= geno_ref,\n",
+ "                       region = region)\n",
+ "    colnames(geno_ref) <- c('chr', 'pos', 'alt', 'ref')\n",
+ "    geno_ref <- geno_ref %>% mutate(chr = gsub('chr','',chr))\n",
+ "    var_list_df <- data.frame(chr = str_split(var_list,\":|_\",simplify = T)[,1] %>% gsub('chr','',.),\n",
+ "                  pos = str_split(var_list,\":|_\",simplify = T)[,2],\n",
+ "                  ref = str_split(var_list,\":|_\",simplify = T)[,3],\n",
+ "                  alt = str_split(var_list,\":|_\",simplify = T)[,4])\n",
+ "    # merge_genotype_data is defined below\n",
+ "    aligned_var_df <- merge_genotype_data(geno_ref, var_list_df, all=FALSE)\n",
+ "    aligned_var <- aligned_var_df %>%\n",
+ "      mutate(id = {\n",
+ "        if (grepl(\":\", var_list[1])) {\n",
+ "          if (grepl(\"_\", var_list[1])) {\n",
+ "            paste(chr, paste(pos, ref, alt, sep = \"_\"),sep = ':')\n",
+ "          } else {\n",
+ "            paste(chr, pos, ref, alt, sep = \":\")\n",
+ "          }\n",
+ "        } else {\n",
+ "          paste(chr, pos, ref, alt, sep = \"_\")\n",
+ "        }\n",
+ "      }) %>%\n",
+ "      pull(id)\n",
+ "    if (grepl(\"chr\", var_list[1])) aligned_var <- paste0(\"chr\",aligned_var)\n",
+ "    return(aligned_var)\n",
+ "  }\n",
+ "  \n",
+ "  # function to map a variant list to the genotype reference\n",
+ "  merge_genotype_data <- function(df1, df2, all = TRUE) {\n",
+ "    setDT(df1)\n",
+ "    setDT(df2)\n",
+ "    df1[, key := paste(chr, pos, pmin(alt, ref), pmax(alt, ref))]\n",
+ "    df2[, key := paste(chr, pos, pmin(alt, ref), pmax(alt, ref))]\n",
+ "    df2[df1, on = \"key\", flip := i.alt == ref & i.ref == alt, by = .EACHI]\n",
+ "    df2[flip == TRUE, c(\"alt\", \"ref\") := .(ref, alt)]\n",
+ "    if (all) {\n",
+ "      df_combined <- unique(rbindlist(list(df1[, .(chr, pos, alt, ref)], df2[, .(chr, pos, alt, ref)])), by = c(\"chr\", \"pos\", \"alt\", \"ref\"))\n",
+ "    } else {\n",
+ "      df_combined <- df2[, .(chr, pos, alt, ref)]\n",
+ "    }\n",
+ "    return(df_combined)\n",
+ "  }\n",
+ "\n",
+ "  # Function to filter credible sets whose variants all have PIP < pip_thres and to update rows based on the _min_corr suffix condition\n",
+ "  update_and_filter_cs_ids <- function(dat_con, dat_susie, df, purity_thresholds) {\n",
+ "    # Identify cs_coverage columns\n",
+ "    cs_columns <- grep(\"^cs_coverage\", names(df), value = TRUE)\n",
+ "    \n",
+ "    # Flag rows with any cs_coverage > 0\n",
+ "    df$cs_all_non_zero_orig <- rowSums(df[cs_columns] == 0) != length(cs_columns)\n",
+ "    \n",
+ "    # Iterate over cs_coverage columns and their unique CS IDs\n",
+ "    for (cs_column in cs_columns) {\n",
+ "      unique_cs_ids <- unique(df[[cs_column]])\n",
+ "      # Update top loci based on minimum correlation for each coverage column\n",
+ "      for(purity_threshold in purity_thresholds){\n",
+ "        df <- update_top_loci_cs_annotation(dat_con, dat_susie, top_loci_df = df, coverage_value = cs_column, threshold = purity_threshold)\n",
+ "      }\n",
+ "      \n",
+ "      for (cs_id in unique_cs_ids[unique_cs_ids > 0]) {\n",
+ "        # Flag CS IDs where all PIPs < pip_thres (default 0.05)\n",
+ "        pip_check <- df[df[[cs_column]] == cs_id, \"pip\"] < ${pip_thres}\n",
+ "        # drop the whole CS if all PIPs < pip_thres (default 0.05)\n",
+ "        if (all(pip_check, na.rm = TRUE)) {\n",
+ "          df[[cs_column]] <- ifelse(df[[cs_column]] == cs_id, 0, df[[cs_column]])\n",
+ "        }\n",
+ "      }\n",
+ "    }\n",
+ "    # Identify min corr cs_coverage columns\n",
+ "    mincor_columns <- grep(\"_min_corr\", names(df), value = TRUE) # Identify _mincor columns\n",
+ "    \n",
+ "    # Filter out rows where all cs_coverage columns are 0, all _mincor columns are 0,\n",
+ "    # and cs_all_non_zero_orig is TRUE\n",
+ "    df <- df %>%\n",
+ "      filter(!(cs_all_non_zero_orig &\n",
+ "           rowSums(df[cs_columns] == 0) == length(cs_columns) &\n",
+ "           rowSums(df[mincor_columns] == 0) == length(mincor_columns))) %>%\n",
+ "      select(-cs_all_non_zero_orig) # Remove the temporary flag column\n",
+ "    df <- df %>%\n",
+ "      select(-starts_with(\"cs_coverage\"), all_of(cs_columns), all_of(mincor_columns))\n",
+ "    return(df)\n",
+ "  }\n",
+ "  \n",
+ "  \n",
+ "  # update the top loci table\n",
+ "  update_top_loci_cs_annotation <- function(dat_con, dat_susie, top_loci_df, coverage_value, threshold ) {\n",
+ "    # pick the susie object for the given CS coverage\n",
+ "    if(coverage_value == 'cs_coverage_0.95') dat_susie_tmp <- dat_susie\n",
+ "    else dat_susie_tmp <- dat_susie$sets_secondary[[gsub('cs_','',coverage_value)]]\n",
+ "    # get purity results\n",
+ "    purity_res <- dat_susie_tmp$sets$purity \n",
+ "    \n",
+ "    not_pass_min_cs <- rownames(purity_res)[purity_res$min.abs.corr < threshold] %>%\n",
+ "      gsub('L', '', .)\n",
+ "    top_loci_df[[paste0( coverage_value, \"_min_corr_\", threshold)]] <- top_loci_df[[coverage_value]]\n",
+ "    if (length(not_pass_min_cs) > 0) {\n",
+ "      top_loci_df[[paste0( coverage_value, \"_min_corr_\", threshold)]][top_loci_df[[coverage_value]] %in% not_pass_min_cs] <- 0\n",
+ "    }\n",
+ "    return(top_loci_df)\n",
+ "  }\n",
+ "  \n",
+ "  # function to decide whether to run update_and_filter_cs_ids\n",
+ "  process_top_loci <- function(dat_con, dat_susie,purity_thresholds) {\n",
+ "    data_frame <- dat_con$top_loci\n",
+ "    if (has_rows(data_frame)) {\n",
+ "      return(update_and_filter_cs_ids(dat_con, dat_susie, data_frame, purity_thresholds))\n",
+ "    } else {\n",
+ "      return(data_frame)\n",
+ "    }\n",
+ "  }\n",
+ "\n",
+ "  # function to get PIPs with their variant names\n",
+ "  get_pip_withname <- function(susie_obj){\n",
+ "    pip = susie_obj$susie_result_trimmed$pip\n",
+ "    if(!(is.null(pip))) names(pip) = susie_obj$variant_names\n",
+ "    return(pip)\n",
+ "  }\n",
+ "  \n",
+ "  # add columns in top loci to indicate whether the variant is identified in the susie_on_top_pc top loci table\n",
+ "  annotate_susie <- function(obj, top_loci){\n",
+ "    tryCatch({\n",
+ "      df <- data.frame()\n",
+ "      \n",
+ "      results <- lapply(obj[['susie_on_top_pc']], function(x) {\n",
+ "        x[['top_loci']]\n",
+ "      })\n",
+ "      results <- results[!sapply(results, is.null)]\n",
+ "      \n",
+ "      df <- bind_rows(results)\n",
+ "      \n",
+ "      top_loci_others <- df %>%\n",
+ "        filter(rowSums(select(., starts_with('cs_coverage_'))) == 0)\n",
+ "      top_loci_cs <- df %>%\n",
+ "        filter(rowSums(select(., starts_with('cs_coverage_'))) > 0)\n",
+ "      \n",
+ "      susie_vars <- rbind(top_loci_others %>% filter(pip >= 0.05), top_loci_cs) %>% pull(variant_id)\n",
+ "\n",
+ "\n",
+ "      if(length(susie_vars) > 0) susie_vars <- align_to_genoref(var_list = susie_vars, geno_ref = geno_ref, region = paste0(gsub(\"chr\",\"\",\"${_meta_info[0]}\"), \":\", \"${_meta_info[1]}\", \"-\", \"${_meta_info[2]}\"))\n",
+ "      \n",
+ "      susie_vars_cs <- top_loci_cs %>% pull(variant_id)\n",
+ "      if(length(susie_vars_cs) > 0) 
susie_vars_cs <- align_to_genoref(var_list = susie_vars_cs, geno_ref = geno_ref, region = paste0(gsub(\"chr\",\"\",\"${_meta_info[0]}\"), \":\", \"${_meta_info[1]}\", \"-\", \"${_meta_info[2]}\"))\n", + " \n", + " top_loci <- top_loci %>% mutate(annotated_susie = ifelse(variant_id %in% susie_vars, 1, 0),\n", + " annotated_susie_cs = ifelse(variant_id %in% susie_vars_cs, 1, 0))\n", + " \n", + " return(top_loci)\n", + " }, error = function(e) {\n", + " warning(\"Error in annotate_susie: \", e$message)\n", + " })\n", + " }\n", + " \n", + " # function to match context values and add analysis_name\n", + " match_contexts <- function(df, context_meta) {\n", + " # Split context_meta's context column into separate rows, one for each comma-separated value\n", + " context_meta_expanded <- context_meta %>% \n", + " separate_rows(context, sep = \",\") %>%\n", + " mutate(context = trimws(context)) # Remove potential leading and trailing whitespace\n", + " # Loop through each context in df to find the most precise match in context_meta_expanded\n", + " df$super_context <- sapply(df$context, function(df_context) {\n", + " # Find all potential matches\n", + " potential_matches <- context_meta_expanded %>%\n", + " filter(str_detect(df_context, context)) %>%\n", + " arrange(desc(nchar(context))) # Sort potential matches by descending length of context\n", + "\n", + " # Select the longest match as the most precise one\n", + " if(nrow(potential_matches) > 0) {\n", + " return(potential_matches$context[1])\n", + " } else {\n", + " return(NA) # Return NA if no match is found\n", + " }\n", + " })\n", + "\n", + " return(df)\n", + " }\n", + " # Define a function to process context data and match contexts\n", + " process_and_match_contexts <- function(context_meta_path, cons_top_loci, res_minp) {\n", + " # Read context metadata\n", + " context_meta <- fread(context_meta_path)\n", + "\n", + " # Extract super contexts\n", + " super_contexts <- cons_top_loci[[1]] %>% unlist %>% names %>% as.character\n", + " # Create a data frame of context and minimum p-values\n", + " context_df <- data.frame(context = super_contexts,\n", + " min_p = res_minp[[1]][super_contexts] %>% unlist %>% as.numeric)\n", + " # Match contexts using the previously defined match_contexts function\n", + " context_df <- match_contexts(context_df, context_meta)\n", + " # Filter context_df by super_context\n", + " context_df_no_order <- context_df %>% filter(context == super_context)\n", + " context_df_with_order <- context_df %>% filter(context != super_context) \n", + "\n", + " return(list(context_df_no_order= context_df_no_order,context_df_with_order = context_df_with_order ))\n", + " }\n", + "\n", + " # Function to filter introns based on leafcutter2 cluster\n", + " process_lf2_cluster <- function(df){\n", + " df <- df%>%\n", + " mutate(cluster_id = str_extract(context, \"clu_\\\\d+\"), \n", + " category = str_extract(context, \"([^:]+)(?=:EN)\")) %>%\n", + " group_by(cluster_id) %>%\n", + " # filter the intron clusters with NE and IN events\n", + " mutate(cluster_status = ifelse(any(category %in% c(\"NE\", \"IN\")), \"no\", \"yes\")) %>%\n", + " filter(cluster_status == \"yes\") %>%\n", + " ungroup %>%\n", + " filter(!str_detect(context, \":PR:\")) %>% #FIXME: only keep unproductive events in leafcutter2 results. 
\n", + " select(-cluster_status, -cluster_id, -category)\n", + " return(df)\n", + " }\n", + "\n", + " # Function to combine the contexted with and without order \n", + " combine_order <- function(context_df_no_order, context_df_with_order){\n", + " context_df_with_order <- context_df_with_order %>%\n", + " group_by(super_context) %>%\n", + " summarise(min_p = min(min_p), .groups = 'keep') %>%\n", + " left_join(context_df_with_order, by = c(\"super_context\", \"min_p\" = \"min_p\"))\n", + "\n", + " # Combine context_df_no_order and context_df_with_order contexts\n", + " cons_top_loci_minp <- c(context_df_no_order$context, context_df_with_order$context)\n", + " return(cons_top_loci_minp)\n", + " }\n", + " \n", + " # Revised summarise_lfsr_by_marker function with distinct error cases\n", + " summarise_lfsr_by_marker <- function(multicontext_res,\n", + " pos = NULL,\n", + " markers = NULL,\n", + " conditions = NULL,\n", + " poslim = NULL,\n", + " lfsr_cutoff = 0.01,\n", + " sentinel_only = FALSE,\n", + " cs_plot = NULL,\n", + " conditional_effect = TRUE) {\n", + " tryCatch({\n", + " # Extract fitted object from multicontext_res\n", + " fit <- multicontext_res[[1]]$mvsusie_fitted\n", + " \n", + " # Set default parameters if not provided\n", + " if (is.null(markers)) markers <- fit$variable_names\n", + " if (is.null(conditions)) conditions <- fit$condition_names\n", + " if (is.null(pos)) pos <- seq(1, length(markers))\n", + " if (is.null(poslim)) poslim <- range(pos)\n", + " if (is.null(cs_plot)) cs_plot <- names(fit$sets$cs)\n", + " \n", + " # Create initial data frame for plotting\n", + " pdat <- data.frame(pip = fit$pip, pos = pos, cs = as.character(NA),\n", + " stringsAsFactors = FALSE)\n", + " css <- names(fit$sets$cs)\n", + " for (i in css) {\n", + " j <- fit$sets$cs[[i]]\n", + " pdat[j, \"cs\"] <- i\n", + " }\n", + " rows <- which(!is.na(pdat$cs))\n", + " pdat_cs <- pdat[rows, ]\n", + " \n", + " # Filter by position limits\n", + " rows1 <- which(pdat$pos >= poslim[1] & pdat$pos <= poslim[2])\n", + " rows2 <- which(pdat_cs$pos >= poslim[1] & pdat_cs$pos <= poslim[2])\n", + " pdat <- pdat[rows1, ]\n", + " pdat_cs <- pdat_cs[rows2, ]\n", + " \n", + " # Order and label credible sets\n", + " pdat_cs$cs <- factor(pdat_cs$cs)\n", + " css <- levels(pdat_cs$cs)\n", + " L <- length(css)\n", + " cs_pos <- sapply(fit$sets$cs[css], function(x) median(pos[x]))\n", + " css <- css[order(cs_pos)]\n", + " pdat_cs$cs <- factor(pdat_cs$cs, levels = css)\n", + " cs_size <- sapply(fit$sets$cs[css], length)\n", + " for (i in 1:L) {\n", + " j <- css[i]\n", + " if (cs_size[i] == 1) {\n", + " levels(pdat_cs$cs)[i] <- sprintf(\"%s (1 SNP)\", j)\n", + " } else {\n", + " levels(pdat_cs$cs)[i] <- sprintf(\"%s (%d SNPs, %0.3f purity)\",\n", + " j, cs_size[j], fit$sets$purity[j, \"min.abs.corr\"])\n", + " }\n", + " }\n", + " \n", + " # Set up traits and effects matrix\n", + " traits <- conditions\n", + " r <- length(traits)\n", + " lmax <- nrow(fit$alpha)\n", + " fit$b1_rescaled <- fit$b1_rescaled[, -1, ]\n", + " rownames(fit$b1_rescaled) <- paste0(\"L\", 1:lmax)\n", + " rownames(fit$single_effect_lfsr) <- paste0(\"L\", 1:lmax)\n", + " colnames(fit$single_effect_lfsr) <- traits\n", + " rownames(fit$alpha) <- paste0(\"L\", 1:lmax)\n", + " effects <- matrix(0, r, L)\n", + " rownames(effects) <- traits\n", + " colnames(effects) <- css\n", + " \n", + " # Initialize effect data frame\n", + " effect_dat <- data.frame(matrix(as.numeric(NA),\n", + " prod(length(conditions) * length(markers)), 8))\n", + " names(effect_dat) <- 
c(\"trait\", \"marker\", \"pos\", \"effect\", \"z\", \"lfsr\", \"cs\", \"sentinel\")\n", + " effect_dat$trait <- rep(conditions, length(markers))\n", + " effect_dat$marker <- rep(markers, each = length(conditions))\n", + " effect_dat$pos <- rep(pos, each = length(conditions))\n", + " effect_dat$sentinel <- 0\n", + " \n", + " # Process each credible set\n", + " for (i in 1:L) {\n", + " l <- css[i]\n", + " j <- fit$sets$cs[[l]]\n", + " b <- fit$b1_rescaled[l, j, ]\n", + " if (conditional_effect) {\n", + " b <- b / fit$alpha[l, j]\n", + " }\n", + " marker_names <- markers[j]\n", + " marker_idx <- which(effect_dat$marker %in% marker_names)\n", + " effect_dat[marker_idx, \"cs\"] <- l\n", + " effect_dat[marker_idx, \"lfsr\"] <- rep(fit$single_effect_lfsr[l, ], length(marker_names))\n", + " effect_dat[marker_idx, \"effect\"] <- as.vector(t(b))\n", + " if (!is.null(fit$z)) {\n", + " effect_dat[marker_idx, \"z\"] <- as.vector(t(fit$z[j, ]))\n", + " }\n", + " max_idx <- which.max(fit$alpha[l, j])\n", + " effect_dat[which(effect_dat$marker == marker_names[max_idx]), \"sentinel\"] <- 1\n", + " effects[, i] <- ifelse(is.null(nrow(b)), b, b[max_idx, ])\n", + " }\n", + " \n", + " # Filter and process effect data\n", + " effect_dat <- effect_dat[which(!is.na(effect_dat$cs)), ]\n", + " rows1 <- which(effect_dat$pos >= poslim[1] & effect_dat$pos <= poslim[2])\n", + " effect_dat <- effect_dat[rows1, ]\n", + " effect_dat$marker_cs <- paste0(effect_dat$marker, \"(\", effect_dat$cs, \")\")\n", + " pdat_sentinel <- effect_dat[which(effect_dat$sentinel == 1), ]\n", + " pdat_sentinel <- unique(pdat_sentinel[, c(\"marker\", \"marker_cs\", \"pos\")])\n", + " pdat_sentinel$pip <- fit$pip[match(pdat_sentinel$marker, fit$variable_names)]\n", + " \n", + " # Optionally keep only sentinel variants\n", + " if (sentinel_only) {\n", + " effect_dat <- effect_dat[which(effect_dat$sentinel == 1), ]\n", + " }\n", + " # Optionally filter by specified credible sets to plot\n", + " if (!missing(cs_plot)) {\n", + " effect_dat <- effect_dat[which(effect_dat$cs %in% cs_plot), ]\n", + " }\n", + " effect_dat$cs <- factor(effect_dat$cs, levels = css)\n", + " effect_dat$trait <- factor(effect_dat$trait, traits)\n", + " rows <- which(effect_dat$lfsr < lfsr_cutoff)\n", + " effect_dat <- effect_dat[rows, ]\n", + " \n", + " # Error Case 1: No variants left after LFSR cutoff filtering\n", + " if (nrow(effect_dat) == 0) {\n", + " message(\"Warning Case 1: No variants passed the LFSR cutoff threshold; returning NA values.\")\n", + " top_loci <- multicontext_res[[1]][['top_loci']]\n", + " top_loci$trait <- NA\n", + " top_loci$lfsr <- NA\n", + " return(top_loci)\n", + " }\n", + " \n", + " effect_dat <- effect_dat %>% \n", + " mutate(pip = fit$pip[match(marker, names(fit$pip))],\n", + " variant_id = marker)\n", + " \n", + " # Summarise results and merge with top loci\n", + " result <- effect_dat %>%\n", + " group_by(variant_id, cs) %>%\n", + " arrange(trait) %>%\n", + " summarise(trait = paste(trait, collapse = \";\"), \n", + " lfsr = paste(lfsr, collapse = \";\"),\n", + " .groups = \"drop\") %>% \n", + " mutate(cs_coverage_0.95 = gsub('L', '', cs)) %>%\n", + " select(-cs)\n", + " \n", + " top_loci <- multicontext_res[[1]][['top_loci']]\n", + " merge(top_loci, result, by = c('variant_id', 'cs_coverage_0.95'), all.x = TRUE)\n", + " }, error = function(e) {\n", + " # Error Case 2: Any unexpected error in processing\n", + " message(\"Warning Case 2: An unexpected error occurred: \", e$message)\n", + " top_loci <- 
tryCatch(multicontext_res[[1]][['top_loci']],\n",
+ "               error = function(x) data.frame(variant_id = NA, cs_coverage_0.95 = NA))\n",
+ "      top_loci$trait <- NA\n",
+ "      top_loci$lfsr <- NA\n",
+ "      return(top_loci)\n",
+ "    })\n",
+ "  }\n",
+ "  \n",
+ "  # function to add effect size\n",
+ "  add_coef_column <- function(dat_study) {\n",
+ "    # Ensure the required components exist\n",
+ "    # Check if dat_study contains both required components\n",
+ "    if (!(\"susie_result_trimmed\" %in% names(dat_study)) || !(\"top_loci\" %in% names(dat_study))) {\n",
+ "      # Return an empty data frame instead of stopping\n",
+ "      return(data.frame())\n",
+ "    }\n",
+ "    \n",
+ "    # Extract the coefficient matrix and ensure top_loci is a data.frame\n",
+ "    coef_matrix <- dat_study[[\"susie_result_trimmed\"]][[\"coef\"]][-1,] / dat_study[[\"susie_result_trimmed\"]][['pip']]\n",
+ "    dt <- dat_study[[\"top_loci\"]]\n",
+ "    if (!is.data.frame(dt)) {\n",
+ "      dt <- as.data.frame(dt)\n",
+ "    }\n",
+ "    \n",
+ "    # Check that dt has the required columns\n",
+ "    if (!all(c(\"variant_id\", \"trait\") %in% names(dt))) {\n",
+ "      stop(\"The 'top_loci' data frame must contain 'variant_id' and 'trait' columns.\")\n",
+ "    }\n",
+ "    \n",
+ "    # Add the 'coef' column by matching each variant with its corresponding traits\n",
+ "    dt[[\"conditional_effect\"]] <- mapply(function(variant, trait_str) {\n",
+ "      # Return NA if trait_str is missing or variant not found in coef_matrix\n",
+ "      if (is.na(trait_str) || !(variant %in% rownames(coef_matrix))) {\n",
+ "        return(NA_character_)\n",
+ "      }\n",
+ "      # Split the trait string by ';' and trim extra whitespace\n",
+ "      traits <- trimws(unlist(strsplit(trait_str, \";\")))\n",
+ "      # Subset coefficients for the variant and specified traits\n",
+ "      coefs <- coef_matrix[variant, traits, drop = TRUE]\n",
+ "      paste(coefs, collapse = \";\")\n",
+ "    }, dt[[\"variant_id\"]], dt[[\"trait\"]], USE.NAMES = FALSE)\n",
+ "    \n",
+ "    dat_study[[\"top_loci\"]] <- dt\n",
+ "    return(dt)\n",
+ "  }\n",
+ "\n",
+ "  ###### MAIN ######\n",
+ "  # Process each path and collect results\n",
+ "  orig_files = c(${\",\".join(['\"%s\"' % x.absolute() for x in _input])})\n",
+ "  # Extract info from each RDS file\n",
+ "  results <- list()\n",
+ "  gene = \"${_meta_info[3]}\"\n",
+ "  geno_ref <- \"${geno_ref}\"\n",
+ "\n",
+ "  # Post Processing: Extracting info\n",
+ "  # use a for loop instead of apply to save memory\n",
+ "  res <- res_sum <- res_minp <- cons_top_loci <- list()\n",
+ "  pip_sum <- data.frame()\n",
+ "  for(i in seq_along(orig_files)) {\n",
+ "    rds_path <- orig_files[i]\n",
+ "    dat <- tryCatch({\n",
+ "      readRDS(rds_path)\n",
+ "    }, error = function(e) {\n",
+ "      writeLines(rds_path %>% basename, gsub(\".tsv\",\"_error\",\"${_output}\"))\n",
+ "      return(NULL)\n",
+ "    })\n",
+ "\n",
+ "    if(is.null(dat)) next\n",
+ "    \n",
+ "    # extract the QTL type from the susie rds file name; if we have set decent condition names, this could be removed\n",
+ "    temp_list <- list() # Temporary list to store inner results\n",
+ "    genes <- names(dat)\n",
+ "    for(id in genes){\n",
+ "      dat_study <- dat[[id]]\n",
+ "      temp_list <- list()\n",
+ "      if(${\"TRUE\" if mnm else \"FALSE\"}) {\n",
+ "        if (length(dat[[id]]) <= 1) {\n",
+ "          message(\"No multi-context results due to no twas results; skipping multi-context export.\")\n",
+ "          next\n",
+ "        }\n",
+ "        conditions <- paste(orig_files[i] %>% basename %>% str_split(., '[.]', simplify = T) %>% .[,1],dat_study[['context_names']] %>% paste( ., collapse = ';'), sep = ':') \n",
+ "        dat_study[['top_loci']] <- summarise_lfsr_by_marker(dat)\n",
+ "        dat_study[['top_loci']] <- add_coef_column(dat_study)\n",
+ "      } else conditions <- names(dat_study)[sapply(dat_study, function(x) length(x) > 0)]\n",
+ "      if(length(conditions) > 0) {\n",
+ "        for(condition in conditions) {\n",
+ "          if (${\"FALSE\" if mnm else \"TRUE\"}) dat_con <- dat_study[[condition]]${[['fsusie_summary']] if fsusie else ''} else dat_con <- dat_study${[['fsusie_summary']] if fsusie else ''}\n",
+ "          if (${\"TRUE\" if gwas else \"FALSE\"}){\n",
+ "            method <- names(dat_study[[condition]])[!names(dat_study[[condition]]) %in% c('rss_data_analyzed', 'diagnostics')] ## FIXME: this method is not shown in meta (not needed if only one method), or we can use `context@method` in meta\n",
+ "            # Make an automated decision on which method to use\n",
+ "            if (!is.null(dat_con$diagnostics) && nrow(dat_con$diagnostics) > 0) { # Check if there's a diagnostic table for this condition\n",
+ "              # Identify high correlation columns for top_cs\n",
+ "              top_cs_index <- which.max(abs(dat_con$diagnostics$top_z))\n",
+ "              # Get the column names that match the pattern \"cs_corr_\" followed by numbers\n",
+ "              cs_corr_cols <- grep(\"^cs_corr_\\\\d+$\", names(dat_con$diagnostics), value = TRUE)\n",
+ "              # Initialize an empty list to store high correlation columns\n",
+ "              high_corr_cols <- list()\n",
+ "              # Iterate through the correlation columns and check for correlations > cs_corr_thres\n",
+ "              for (col in cs_corr_cols) {\n",
+ "                if (!is.na(dat_con$diagnostics[top_cs_index, ..col]) && abs(dat_con$diagnostics[top_cs_index, ..col]) > ${cs_corr_thres}) {\n",
+ "                  high_corr_cols <- c(high_corr_cols, col)\n",
+ "                }\n",
+ "              }\n",
+ "              # Process the data\n",
+ "              diagnostic_decision <- auto_decision(dat_con$diagnostics, high_corr_cols)\n",
+ "              method <- if (unique(diagnostic_decision$method) == \"BCR\") {\n",
+ "                message(paste0(\"BCR for \", condition, \" study and \", gene, \" block\"))\n",
+ "                \"bayesian_conditional_regression_RSS_QC_RAISS_imputed\"\n",
+ "              } else if (unique(diagnostic_decision$method) == \"SER\") {\n",
+ "                message(paste0(\"SER for \", condition, \" study and \", gene, \" block\"))\n",
+ "                \"single_effect_NO_QC\"\n",
+ "              } else if (unique(diagnostic_decision$method) == \"BVSR\") {\n",
+ "                message(paste0(\"BVSR for \", condition, \" study and \", gene, \" block\"))\n",
+ "                \"susie_rss_RSS_QC_RAISS_imputed\"\n",
+ "              } else {\n",
+ "                stop(\"Invalid method selected.\")\n",
+ "              }\n",
+ "            }\n",
+ "            if (length(method) != 1) {\n",
+ "              stop(\"Multiple methods after automated QC. Please check!\")\n",
+ "            }\n",
+ "            dat_con <- dat_con[[method]]\n",
+ "          }\n",
+ "          dat_susie <- dat_con$susie_result_trimmed\n",
+ "          # calculate the sum of PIPs for the variants with PIP > 0\n",
+ "          pip_sum = rbind(pip_sum, \n",
+ "                  data.frame(pip_sum = dat_susie[['pip']] [dat_susie[['pip']] > 0] %>% sum, \n",
+ "                        condition = condition)\n",
+ "                  )\n",
+ "          \n",
+ "          # rename context names if needed\n",
+ "          if(${\"TRUE\" if condition_meta.is_file() else \"FALSE\"}){\n",
+ "            meta <- suppressMessages(read_delim(\"${condition_meta}\", col_names = F))\n",
+ "            context <- meta %>% filter(X1 == condition) %>% pull(X2)\n",
+ "            if (length(context) == 0) {\n",
+ "              context <- condition\n",
+ "              message(\"No matching entries found. 
context has been set to the condition value.\")\n", + " }\n", + " } else {context <- condition}\n", + " # align variants to aligned geno\n", + " if(has_rows(dat_con$top_loci) || has_rows(dat_con$preset_top_loci)) cons_top_loci[[id]][[context]] <- context else cons_top_loci[[id]][[context]] <- NULL \n", + " variant_ids <- c(dat_con$top_loci$variant_id, dat_con$variant_names, dat_con$preset_variants_result$top_loci$variant_id, dat_con$preset_variants_result$variant_names)\n", + " if (!is.null(variant_ids) & length(variant_ids) > 0) {\n", + " unique_variant_ids <- unique(variant_ids)\n", + " aligned_variant_ids <- align_to_genoref(unique_variant_ids, geno_ref, paste0(gsub(\"chr\",\"\",\"${_meta_info[0]}\"), \":\", \"${_meta_info[1]}\", \"-\", \"${_meta_info[2]}\"))\n", + " names(aligned_variant_ids) <- unique_variant_ids\n", + "\n", + " # change beta or z in top loci and sumstats\n", + " top_loci_changed_indexes <- which(dat_con$top_loci$variant_id != aligned_variant_ids[dat_con$top_loci$variant_id] %>% as.character )\n", + " if(has_rows(dat_con$top_loci) & length(top_loci_changed_indexes) > 0) {\n", + " dat_con$top_loci$conditional_effect[top_loci_changed_indexes] <- (-1)* dat_con$top_loci$conditional_effect[top_loci_changed_indexes] # FIXME if coef does not need to check flip\n", + " if (${\"FALSE\" if gwas else \"TRUE\"}) {\n", + " dat_con$top_loci$betahat[top_loci_changed_indexes] <- (-1)* dat_con$top_loci$betahat[top_loci_changed_indexes]\n", + " } else {\n", + " dat_con$top_loci$z[top_loci_changed_indexes] <- (-1)* dat_con$top_loci$z[top_loci_changed_indexes]\n", + " }\n", + " }\n", + " all_changed_indexes <- which(dat_con$variant_names != aligned_variant_ids[dat_con$variant_names] %>% as.character )\n", + " if(length(all_changed_indexes) > 0) {\n", + " if (${\"FALSE\" if gwas else \"TRUE\"}) {\n", + " dat_con$sumstats$betahat[all_changed_indexes] <- (-1)* dat_con$sumstats$betahat[all_changed_indexes]\n", + " } else {\n", + " dat_con$sumstats$z[all_changed_indexes] <- (-1)* dat_con$sumstats$z[all_changed_indexes]\n", + " }\n", + " }\n", + " \n", + "\n", + " # change variant names \n", + " if(has_rows(dat_con$top_loci)) dat_con$top_loci$variant_id <- aligned_variant_ids[dat_con$top_loci$variant_id] %>% as.character ###\n", + " dat_con$variant_names <- aligned_variant_ids[dat_con$variant_names] %>% as.character ###\n", + "\n", + " \n", + " if (${\"FALSE\" if gwas else \"TRUE\"}) { \n", + " res[[id]][[context]] <- list(\n", + " top_loci = process_top_loci(dat_con, dat_susie, purity_thresholds = c(${\",\".join(['\"%s\"' % x for x in min_corr])})),\n", + " pip = get_pip_withname(dat_con)\n", + " )\n", + " # the preset data in fsusie is actually from first PC and analyzed by susie, we'd like to remove them to avoid misleading\n", + " # although no preset res in mnm, we can put it here with the same layer structure. 
It would not report preset results\n", + " if (${\"FALSE\" if fsusie else \"TRUE\"}) {\n", + " if(has_rows(dat_con$preset_variants_result$top_loci)) {\n", + " dat_con$preset_variants_result$top_loci$variant_id <- aligned_variant_ids[dat_con$preset_variants_result$top_loci$variant_id] %>% as.character\n", + " # Calculate and add the conditional effect in one step\n", + " if(has_rows(dat_con$preset_variants_result[['top_loci']])){\n", + " dat_con$preset_variants_result[['top_loci']] <- dat_con$preset_variants_result[['top_loci']] %>%\n", + " mutate(conditional_effect = (coef(dat_con$preset_variants_result$susie_result_trimmed) /\n", + " dat_con$preset_variants_result$susie_result_trimmed[['pip']])[variant_id])\n", + " dat_con$top_loci$conditional_effect[top_loci_changed_indexes] <- (-1)* dat_con$top_loci$conditional_effect[top_loci_changed_indexes] # FIXME if coef does not need to check flip\n", + " }\n", + " }\n", + " dat_con$preset_variants_result$variant_names <- aligned_variant_ids[dat_con$preset_variants_result$variant_names] %>% as.character\n", + " # change beta in preset top loci\n", + " preset_top_loci_changed_indexes <- which(dat_con$preset_variants_result$top_loci$variant_id != aligned_variant_ids[dat_con$preset_variants_result$top_loci$variant_id] %>% as.character )\n", + " if(has_rows(dat_con$top_loci) & length(preset_top_loci_changed_indexes) > 0) dat_con$preset_variants_result$top_loci$betahat[preset_top_loci_changed_indexes] <- (-1)* dat_con$preset_variants_result$top_loci$betahat[preset_top_loci_changed_indexes] \n", + "\n", + " res[[id]][[context]][['region_info']] = dat_con$region_info\n", + " res[[id]][[context]][['CV_table']] = dat_con$twas_cv_result$performance\n", + " res[[id]][[context]][['preset_top_loci']] = process_top_loci(dat_con$preset_variants_result, dat_con$preset_variants_result$susie_result_trimmed, purity_thresholds = c(${\",\".join(['\"%s\"' % x for x in min_corr])}))\n", + " res[[id]][[context]][['preset_pip']] = get_pip_withname(dat_con$preset_variants_result)\n", + " } else {\n", + " res[[id]][[context]][['region_info']] = dat_study[[condition]]$region_info\n", + " res[[id]][[context]][['CV_table']] = dat_study[[condition]]$twas_cv_result$performance\n", + " res[[id]][[context]][['top_loci']] = annotate_susie(dat_study[[condition]], res[[id]][[context]][['top_loci']])\n", + " \n", + " } \n", + " # fsusie do not have sumstats or p value\n", + " if (${\"FALSE\" if fsusie or mnm else \"TRUE\"}) {\n", + " res_sum[[id]][[context]] <- list(\n", + " variant_names = dat_con$variant_names,\n", + " sumstats = dat_con$sumstats\n", + " )\n", + "\n", + " if (${\"FALSE\" if fsusie or mnm else \"TRUE\"}) res_minp[[id]][[context]] <- min(pecotmr:::wald_test_pval(dat_con$sumstats$betahat, dat_con$sumstats$sebetahat, n = 1000)) ##assuming sample size is 1000\n", + " }\n", + " \n", + " } else {\n", + " res[[id]][[context]][[method]] <- list(\n", + " top_loci = process_top_loci(dat_con, dat_susie, purity_thresholds = c(${\",\".join(['\"%s\"' % x for x in min_corr])})),\n", + " pip = get_pip_withname(dat_con)\n", + " )\n", + " res_sum[[id]][[context]][[method]] <- list(\n", + " variant_names = dat_con$variant_names,\n", + " sumstats = dat_con$sumstats\n", + " )\n", + "\n", + " }\n", + " }\n", + " \n", + "\n", + " if(has_rows(dat_con$top_loci) || has_rows(dat_con$preset_top_loci)) cons_top_loci[[id]][[context]] <- context else cons_top_loci[[id]][[context]] <- NULL\n", + " }\n", + " }\n", + " }\n", + " }\n", + "\n", + " cons_top_loci <- cons_top_loci %>% compact() # Use 
'compact' to remove NULLs\n",
+ "  cons_top_loci <- if(length(cons_top_loci) > 0) cons_top_loci else NA\n",
+ "\n",
+ "  combine_data = combine_data_sumstats = cons_top_loci_minp = ''\n",
+ "  combine_data = paste0(\"${_output:add}\",\"/\",\"${name}\", \".\", ${'\"epigenomics_\"' if fsusie else '\"\"'}, ${ '\"metabolomics_\"' if metaQTL else '\"\"'}, gene, \".cis_results_db.export.rds\")\n",
+ "  if (${\"FALSE\" if fsusie else \"TRUE\"}) combine_data_sumstats = gsub(\"export.rds$\", \"export_sumstats.rds\", combine_data)\n",
+ "  \n",
+ "  if (${\"TRUE\" if exported_file.is_file() else \"FALSE\"}){\n",
+ "    if (file.exists(combine_data)) {\n",
+ "      res_exp <- readRDS(combine_data)\n",
+ "      res[[gene]] <- c(res[[gene]], res_exp[[gene]]) # this may need to change if there are multiple genes in one rds file\n",
+ "    }\n",
+ "    if (file.exists(combine_data_sumstats)) {\n",
+ "      res_sum_exp <- readRDS(combine_data_sumstats)\n",
+ "      res_sum[[gene]] <- c(res_sum[[gene]], res_sum_exp[[gene]])\n",
+ "    }\n",
+ "  }\n",
+ "  saveRDS(res, combine_data)\n",
+ "  # only save sumstats results when NOT fsusie or multi gene mvsusie\n",
+ "  if (${\"FALSE\" if fsusie or mnm else \"TRUE\"}) saveRDS(res_sum, combine_data_sumstats)\n",
+ "  \n",
+ "  # generate md5 for data transfer\n",
+ "  system(paste(\"md5sum \",combine_data, \" > \", paste0(combine_data, \".md5\")))\n",
+ "  if (${\"FALSE\" if fsusie or mnm else \"TRUE\"}) system(paste(\"md5sum \",combine_data_sumstats, \" > \", paste0(combine_data_sumstats, \".md5\")))\n",
+ "  \n",
+ "  \n",
+ "  TSS <- tryCatch({dat_con$region_info$region_coord$start}, error = function(e) {return(NA)})\n",
+ "  if (length(res) > 0) conditions = paste(names(res[[1]]), collapse = \",\") else conditions = ''\n",
+ "  \n",
+ "  # fsusie does not have sumstats or p-values, so no need to run this\n",
+ "  if (${\"FALSE\" if fsusie or mnm or gwas else \"TRUE\"}) {\n",
+ "    context_map <- process_and_match_contexts('${context_meta}', cons_top_loci, res_minp)\n",
+ "    context_map$context_df_with_order <- process_lf2_cluster(context_map$context_df_with_order)\n",
+ "    cons_top_loci_minp <- combine_order(context_map$context_df_no_order, context_map$context_df_with_order)\n",
+ "  }\n",
+ "\n",
+ "  # save the pip sum file\n",
+ "  write_delim(pip_sum, \n",
+ "        paste0(\"${_output:add}\",\"/\",\"${name}\", \".\", ${'\"epigenomics_\"' if fsusie else '\"\"'}, ${ '\"metabolomics_\"' if metaQTL else '\"\"'}, gene, \".pip_sum\"), \n",
+ "        delim = '\\t')\n",
+ "\n",
+ "  meta = data.frame(chr=\"${_meta_info[0]}\", start=\"${_meta_info[1]}\", end=\"${_meta_info[2]}\", region_id=\"${_meta_info[3]}\", TSS = if(is.null(TSS)) NA else TSS, \n",
+ "    original_data = paste(basename(orig_files), collapse = \",\"), combined_data = basename(combine_data), combined_data_sumstats = basename(combine_data_sumstats), \n",
+ "    conditions = conditions, \n",
+ "    conditions_top_loci = if(length(cons_top_loci) > 0) cons_top_loci[[1]] %>% unlist %>% names %>% as.character %>% paste(., collapse = ',') else ''\n",
+ "    # , conditions_top_loci_minp = if(length(cons_top_loci_minp) > 0) cons_top_loci_minp %>% paste(., collapse = ',') else ''\n",
+ "  )\n",
+ "\n",
+ "  write_delim(meta, \"${_output}\", delim = '\\t')\n",
+ "  "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "sos"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "[cis_results_export_2, gwas_results_export_2]\n",
+ "# set it as True if you only want to run the first step (in parallel) and combine the results manually after all jobs have finished\n",
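+ "# A minimal sketch of the intended two-pass usage (hypothetical invocation; paths elided with ...):\n",
+ "#   pass 1: sos run auto_finemapping_post_processing.ipynb gwas_results_export ... --step1_only\n",
+ "#   pass 2: sos run auto_finemapping_post_processing.ipynb gwas_results_export ...\n",
+ "# Pass 1 writes per-region caches in parallel; pass 2 combines {name}_cache/*.tsv into this meta table.\n",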
+ "parameter: step1_only = False\n", + "skip_if(step1_only)\n", + "# provide exported meta to filter the exported genes \n", + "parameter: exported_file = path()\n", + "# optional: qtl or gwas, there is slightly different in qtl and gwas rds file\n", + "parameter: gwas = False\n", + "input: group_by = 'all'\n", + "output: f\"{cwd}/{name}.{'block_results_db' if gwas else 'cis_results_db'}.tsv\" \n", + "# stop_if(_input[0] not in locals().keys(), 'All files have been exported already') #FIXME should we remove to a separate file. sothat we can stop globally as above\n", + "task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '16G', cores = 1, tags = f'{_output:bn}'\n", + "bash: expand = \"${ }\", container = container, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', entrypoint=entrypoint\n", + " if [ -e \"${_output:ad}/${name}_cache/\" ]; then\n", + " sed 's/^chr/#chr/' `ls ${_output:ad}/${name}_cache/*tsv |head -n1` | head -n 1 > ${_output:an}.temp\n", + " tail -n +2 -q ${_output:ad}/${name}_cache/*.tsv >> ${_output:an}.temp\n", + " error_files=$(find \"${_output:ad}/${name}_cache/\" -type f -name \"*_error\")\n", + "\n", + " if [[ -n $error_files ]]; then\n", + " cat $error_files >> ${_output:an}.error_genes\n", + " else\n", + " echo \"No truncated files detected\"\n", + " fi\n", + " else\n", + " echo \"All files have been exported already\"\n", + " fi\n", + " \n", + "R: expand = \"${ }\", container = container, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', entrypoint=entrypoint\n", + " if (file.exists(paste0(${_output:anr},\".temp\"))) {\n", + " library(tidyverse)\n", + " meta <- read_delim(paste0(${_output:anr},\".temp\"), delim = '\\t')\n", + "\n", + " if (${\"TRUE\" if exported_file.is_file() else \"FALSE\"}){\n", + " exp_meta <- read_delim(${exported_file:r}, delim = '\\t')\n", + " meta <- bind_rows(meta, exp_meta) %>%\n", + " group_by(`#chr`, start, end, region_id, TSS) %>%\n", + " summarise(across(c(original_data, combined_data, combined_data_sumstats, conditions, conditions_top_loci), \n", + " ~paste(unique(.), collapse = \",\")),\n", + " .groups = 'drop')\n", + " }\n", + "\n", + " write_delim(meta, ${_output:r}, delim = '\\t')\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sos" + } + }, + "outputs": [], + "source": [ + "[cis_results_export_3]\n", + "# set it as True if you only want run first step (running parallel) and combine them manually after all finished \n", + "parameter: step1_only = False\n", + "\n", + "skip_if(step1_only)\n", + "bash: expand = \"${ }\", container = container, stderr = f'{_input:n}.stderr', stdout = f'{_input:n}.stdout', entrypoint=entrypoint \n", + " rm -rf \"${_input:ad}/${name}_cache/\"\n", + " rm -rf ${_input:an}.temp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sos" + } + }, + "outputs": [], + "source": [ + "[export_top_loci]\n", + "parameter: export_path=path\n", + "parameter: region = str\n", + "parameter: prefix = str\n", + "parameter: suffix = str\n", + "parameter: fsusie_prefix = ''\n", + "parameter: preset_top_loci = False\n", + "parameter: lfsr_thres = 0.01\n", + "input: f\"{export_path}/{prefix}.{fsusie_prefix}{region}.{suffix}\" \n", + "output: f\"{cwd:a}/{prefix}.{region}.toploci.bed.gz\" \n", + "task: trunk_workers = 1, walltime = '1h', trunk_size = job_size, mem = '16G', cores = 1, tags = f'{_output:bn}'\n", + "R: expand = \"${ }\", container = container, 
stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', entrypoint=entrypoint\n", + " library(tidyverse)\n", + " library(data.table)\n", + " lfsr_thres <- as.numeric(${lfsr_thres})\n", + " #function to filter lfsr\n", + " filter_lfsr <- function(df, lfsr_thres){\n", + " df_na <- df %>% filter(is.na(lfsr)|lfsr == '') %>% mutate(event_ID = NA)\n", + " df %>%\n", + " mutate(\n", + " lfsr_list = str_split(lfsr, \";\") %>% map(as.numeric),\n", + " effect_list = str_split(conditional_effect, \";\"),\n", + " event_list = str_split(event_ID, \";\"),\n", + " valid_indices = map(lfsr_list, ~ which(.x < lfsr_thres))\n", + " ) %>%\n", + " rowwise() %>%\n", + " mutate(\n", + " event_ID = paste(event_list[valid_indices], collapse = \";\"),\n", + " conditional_effect = paste(effect_list[valid_indices], collapse = \";\"),\n", + " lfsr = paste(lfsr_list[valid_indices], collapse = \";\")\n", + " ) %>%\n", + " filter(event_ID != \"\") %>%\n", + " select(any_of(colnames(file_top_loci_rename))) %>%\n", + " ungroup() %>% rbind(df, df_na)\n", + " }\n", + " \n", + " ## main\n", + " res <- readRDS(\"${_input}\")\n", + " file_top_loci <- lapply(names(res), function(gene) {\n", + " lapply(names(res[[gene]]), function(context) {\n", + " lapply(c(\"top_loci\"${',\"preset_top_loci\"' if preset_top_loci else ''}), function(method) {\n", + " finemap_method <- names(res[[gene]][[context]])\n", + " if (!(method %in% finemap_method)) {# for rss_susie\n", + " if (length(finemap_method) == 1) { # that is for one model \n", + " temp_df <- res[[gene]][[context]][[finemap_method]][[method]]\n", + " } else {\n", + " stop(\"Not the only finemap model, models are \", finemap_method)\n", + " }\n", + " if (!is.null(temp_df)) {\n", + " mutate(temp_df, study = context, method = method, finemap_method = finemap_method, region = gene)\n", + " } else {\n", + " NULL \n", + " }\n", + " } else {\n", + " temp_df <- res[[gene]][[context]][[method]]\n", + " if (!is.null(temp_df)) {\n", + " mutate(temp_df, study = context, method = method, region = gene)\n", + " } else {\n", + " NULL \n", + " }\n", + " }\n", + " })\n", + " }) %>% bind_rows() \n", + " }) %>% bind_rows() \n", + " if (nrow(file_top_loci) == 0 || is.null(file_top_loci)) {\n", + " warning(\"No top loci found in the finemapping rds.\")\n", + " } else {\n", + " file_top_loci_rename <- file_top_loci %>% \n", + " # Select necessary columns from the original table\n", + " select(any_of(c(\n", + " \"variant_id\", \"pip\", \n", + " \"cs_coverage_0.95_min_corr\",\"cs_coverage_0.7_min_corr\", \"cs_coverage_0.5_min_corr\",\n", + " \"cs_coverage_0.95_min_corr_0.8\",\"cs_coverage_0.7_min_corr_0.8\", \"cs_coverage_0.5_min_corr_0.8\",\n", + " \"cs_coverage_0.95_min_corr_0.5\",\"cs_coverage_0.7_min_corr_0.5\", \"cs_coverage_0.5_min_corr_0.5\",\n", + " \"study\", \"method\", \"region\", \"finemap_method\", \"conditional_effect\", \"lfsr\", \"trait\"\n", + " ))) %>% \n", + " mutate(\n", + " # Split variant_names into separate components\n", + " chr = str_split(variant_id, \":\", simplify = TRUE)[, 1], # Chromosome\n", + " pos = str_split(variant_id, \":\", simplify = TRUE)[, 2], # Position\n", + " a1 = str_split(variant_id, \":\", simplify = TRUE)[, 4], # Allele 1\n", + " a2 = str_split(variant_id, \":\", simplify = TRUE)[, 3], # Allele 2\n", + " # Rename and duplicate columns as required\n", + " variant_ID = variant_id,\n", + " gene_ID = region,\n", + " event_ID = study,\n", + " PIP = pip,\n", + " finemap_model = finemap_method,\n", + " ) %>%\n", + " {\n", + " if 
(any(c(\"cs_coverage_0.95_min_corr_0.8\", \"cs_coverage_0.7_min_corr_0.8\", \"cs_coverage_0.5_min_corr_0.8\", \n", + " \"cs_coverage_0.95_min_corr_0.5\", \"cs_coverage_0.7_min_corr_0.5\", \"cs_coverage_0.5_min_corr_0.5\", \n", + " \"cs_coverage_0.95_min_corr\", \"cs_coverage_0.7_min_corr\", \"cs_coverage_0.5_min_corr\") %in% colnames(.))) {\n", + " mutate(.,\n", + " cs_coverage_0.95 = if (\"cs_coverage_0.95_min_corr_0.8\" %in% colnames(.)) get(\"cs_coverage_0.95_min_corr_0.8\") else \n", + " if (\"cs_coverage_0.95_min_corr\" %in% colnames(.)) get(\"cs_coverage_0.95_min_corr\") else NA,\n", + " \n", + " cs_coverage_0.7 = if (\"cs_coverage_0.7_min_corr_0.8\" %in% colnames(.)) get(\"cs_coverage_0.7_min_corr_0.8\") else \n", + " if (\"cs_coverage_0.7_min_corr\" %in% colnames(.)) get(\"cs_coverage_0.7_min_corr\") else NA,\n", + " \n", + " cs_coverage_0.5 = if (\"cs_coverage_0.5_min_corr_0.8\" %in% colnames(.)) get(\"cs_coverage_0.5_min_corr_0.8\") else \n", + " if (\"cs_coverage_0.5_min_corr\" %in% colnames(.)) get(\"cs_coverage_0.5_min_corr\") else NA,\n", + " \n", + " cs_coverage_0.95_purity0.5 = if (\"cs_coverage_0.95_min_corr_0.5\" %in% colnames(.)) get(\"cs_coverage_0.95_min_corr_0.5\") else NA,\n", + " cs_coverage_0.7_purity0.5 = if (\"cs_coverage_0.7_min_corr_0.5\" %in% colnames(.)) get(\"cs_coverage_0.7_min_corr_0.5\") else NA,\n", + " cs_coverage_0.5_purity0.5 = if (\"cs_coverage_0.5_min_corr_0.5\" %in% colnames(.)) get(\"cs_coverage_0.5_min_corr_0.5\") else NA\n", + " )\n", + " } else .\n", + " } %>%\n", + " # Conditionally add lfsr if the column exists\n", + " mutate(region_id = basename(\"${_input}\") %>% str_split(., '[.]', simplify = T) %>% .[,2]) %>% \n", + " # Conditionally add lfsr if the column exists\n", + " { if (\"lfsr\" %in% colnames(.)) mutate(., lfsr = lfsr, context = str_split(event_ID, ':', simplify = T) %>% .[,1]) else . } %>% \n", + " { if (\"lfsr\" %in% colnames(.) & lfsr_thres != 0) mutate(., event_ID = trait) else . 
} %>% \n", + " # Select and order the final set of columns\n", + " select(any_of(c(\"chr\", \"pos\", \"a1\", \"a2\", \"variant_ID\", \"gene_ID\", \"event_ID\", \"cs_coverage_0.95\", \"cs_coverage_0.7\", \"cs_coverage_0.5\",\n", + " \"cs_coverage_0.95_purity0.5\", \"cs_coverage_0.7_purity0.5\", \"cs_coverage_0.5_purity0.5\", \"PIP\", \"conditional_effect\", \n", + " \"lfsr\", \"region_id\", \"context\"${'\",method\"' if preset_top_loci else ''},\"finemap_model\"))) %>% \n", + " # Sort by chromosome and position (convert pos to numeric if necessary)\n", + " arrange(chr, as.numeric(pos))\n", + " # Check if \"lfsr\" is in column names\n", + " if (\"lfsr\" %in% colnames(file_top_loci_rename) & lfsr_thres != 0) file_top_loci_rename <- filter_lfsr(file_top_loci_rename, lfsr_thres = lfsr_thres)\n", + " fwrite(file_top_loci_rename,\"${_output}\")\n", + " }\n", + " " + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/code/mnm_analysis/mnm_postprocessing.ipynb b/code/mnm_analysis/mnm_postprocessing.ipynb index 4b5ab4516..367e998b0 100644 --- a/code/mnm_analysis/mnm_postprocessing.ipynb +++ b/code/mnm_analysis/mnm_postprocessing.ipynb @@ -40,8 +40,8 @@ }, "outputs": [], "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb susie_to_tsv \\\n", - " --cwd output/test --rds_path `ls output/test/cache/*rds | head ` --region-list <(head -50 ./dlpfc_region_list) --container containers/stephenslab.sif " + "sos run pipeline/mnm_postprocessing.ipynb susie_to_tsv \\\n", + " --cwd output/test --rds_path `ls output/test/cache/*rds | head ` --region-list <(head -50 ./dlpfc_region_list)" ] }, { @@ -63,14 +63,12 @@ }, "outputs": [], "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb susie_to_tsv \\\n", + "sos run pipeline/mnm_postprocessing.ipynb susie_to_tsv \\\n", " --cwd output/ADGWAS_finemapping_extracted/Bellenguez/ --rds_path `ls GWAS_Finemapping_Results/Bellenguez/ADGWAS2022*rds ` \\\n", - " --region-list ~/1300_hg38_EUR_LD_blocks_orig.tsv \\\n", - " --container containers/stephenslab.sif \n", + " --region-list ~/1300_hg38_EUR_LD_blocks_orig.tsv\n", "\n", - "sos run pipeline/SuSiE_post_processing.ipynb susie_tsv_collapse \\\n", - " --cwd output/ADGWAS_finemapping_extracted --tsv_path `ls output/ADGWAS_finemapping_extracted/*lbf.tsv` \\\n", - " --container containers/stephenslab.sif " + "sos run pipeline/mnm_postprocessing.ipynb susie_tsv_collapse \\\n", + " --cwd output/ADGWAS_finemapping_extracted --tsv_path `ls output/ADGWAS_finemapping_extracted/*lbf.tsv`" ] }, { @@ -92,10 +90,9 @@ }, "outputs": [], "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb fsusie_to_tsv \\\n", + "sos run pipeline/mnm_postprocessing.ipynb fsusie_to_tsv \\\n", " --cwd output/f_susie_tad_haQTL_pos --rds_path `ls output/f_susie_tad_haQTL_pos/cache/*rds ` \\\n", - " --region-list ../eqtl/dlpfc_tad_list \\\n", - " --container containers/stephenslab.sif -s build" + " --region-list ../eqtl/dlpfc_tad_list -s build" ] }, { @@ -107,10 +104,9 @@ }, "outputs": [], "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb fsusie_to_tsv \\\n", + "sos run pipeline/mnm_postprocessing.ipynb fsusie_to_tsv \\\n", " --cwd output/f_susie_tad_meQTL_pos_selected/ --rds_path `ls output/f_susie_tad_meQTL_pos_selected//cache/*1204*rds ` \\\n", - " --region-list ../eqtl/dlpfc_tad_list \\\n", - " --container containers/stephenslab.sif -s build" + " --region-list ../eqtl/dlpfc_tad_list" ] }, { @@ -122,10 +118,9 @@ }, "outputs": [], "source": [ - "sos run 
pipeline/SuSiE_post_processing.ipynb fsusie_to_tsv \\\n", + "sos run pipeline/mnm_postprocessing.ipynb fsusie_to_tsv \\\n", " --cwd output/f_susie_tad_meQTL_pos_2/ --rds_path `ls output/f_susie_tad_meQTL_pos_2//cache/*rds ` \\\n", - " --region-list ../eqtl/dlpfc_tad_list \\\n", - " --container containers/stephenslab.sif -s build" + " --region-list ../eqtl/dlpfc_tad_list -s build" ] }, { @@ -137,10 +132,9 @@ }, "outputs": [], "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb fsusie_to_tsv \\\n", + "sos run pipeline/mnm_postprocessing.ipynb fsusie_to_tsv \\\n", " --cwd output/f_susie_tad_meQTL_pos/ --rds_path `ls output/f_susie_tad_meQTL_pos//cache/*rds ` \\\n", - " --region-list ../eqtl/dlpfc_tad_list \\\n", - " --container containers/stephenslab.sif -s build" + " --region-list ../eqtl/dlpfc_tad_list -s build" ] }, { @@ -152,10 +146,9 @@ }, "outputs": [], "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb fsusie_to_tsv \\\n", + "sos run pipeline/mnm_postprocessing.ipynb fsusie_to_tsv \\\n", " --cwd output/f_susie_tad_haQTL_pos_check_pure_2 --rds_path `ls output/f_susie_tad_haQTL_pos_check_pure_2/cache/*rds ` \\\n", - " --region-list ../eqtl/dlpfc_tad_list \\\n", - " --container containers/stephenslab.sif -s build" + " --region-list ../eqtl/dlpfc_tad_list -s build" ] }, { @@ -167,10 +160,9 @@ }, "outputs": [], "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb fsusie_to_tsv \\\n", + "sos run pipeline/mnm_postprocessing.ipynb fsusie_to_tsv \\\n", " --cwd output/f_susie_tad_meQTL_pos_2/ --rds_path `ls output/f_susie_tad_meQTL_pos_2//cache/*rds ` \\\n", - " --region-list ../eqtl/dlpfc_tad_list \\\n", - " --container containers/stephenslab.sif -s build" + " --region-list ../eqtl/dlpfc_tad_list -s build" ] }, { @@ -193,7 +185,7 @@ }, "outputs": [], "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb susie_pip_landscape_plot \\\n", + "sos run pipeline/mnm_postprocessing.ipynb susie_pip_landscape_plot \\\n", " --cwd output/test/ --plot_list plot_recipe --annot_tibble ~/Annotatr_builtin_annotation_tibble.tsv -s force &" ] }, @@ -206,7 +198,7 @@ }, "outputs": [], "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb susie_upsetR_plot \\\n", + "sos run pipeline/mnm_postprocessing.ipynb susie_upsetR_plot \\\n", " --cwd output/test/ --plot_list plot_recipe_1 -s force &" ] }, @@ -323,7 +315,7 @@ "outputs": [], "source": [ "#susie\n", - "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb cis_results_export \\\n", + "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/mnm_postprocessing.ipynb cis_results_export \\\n", " --region_file /mnt/vast/hpc/homes/rf2872/codes/xqtl-analysis/resource/TADB_enhanced_cis.protein_coding.bed \\\n", " --file_path /mnt/vast/hpc/homes/rf2872/aws/rds_files \\\n", " --name demo_susie \\\n", @@ -346,7 +338,7 @@ "outputs": [], "source": [ "#fsusie\n", - "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb cis_results_export \\\n", + "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/mnm_postprocessing.ipynb cis_results_export \\\n", " --region_file /mnt/vast/hpc/homes/rf2872/codes/xqtl-analysis/resource/extended_TADB.bed \\\n", " --file_path /mnt/vast/hpc/homes/rf2872/aws/rds_files \\\n", " --name demo_fsusie \\\n", @@ -370,7 +362,7 @@ "outputs": [], "source": [ "#meta QTL\n", - "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb cis_results_export \\\n", + "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/mnm_postprocessing.ipynb 
cis_results_export \\\n", " --region_file /mnt/vast/hpc/homes/rf2872/codes/xqtl-analysis/resource/hg38_1362_blocks.bed \\\n", " --file_path /mnt/vast/hpc/homes/rf2872/aws/rds_files \\\n", " --name demo_metaQTL \\\n", @@ -395,7 +387,7 @@ "source": [ "# multi gene mvsusie\n", "# multi gene mvsusie\n", - " sos run /data/interactive_analysis/rf2872/codes/Jan/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb cis_results_export \\\n", + " sos run /data/interactive_analysis/rf2872/codes/Jan/xqtl-pipeline/pipeline/mnm_postprocessing.ipynb cis_results_export \\\n", " --region_file /data/xqtl-analysis/resource/extended_TADB.bed \\\n", " --file_path /data/analysis_result/mnm/ROSMAP/mnm_genes/ \\\n", " --name ROSMAP_AC_DeJager_eQTL \\\n", @@ -429,7 +421,7 @@ }, "outputs": [], "source": [ - " sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb gwas_results_export \\\n", + " sos run /home/rf2872/codes/xqtl-pipeline/pipeline/mnm_postprocessing.ipynb gwas_results_export \\\n", " --region_file /mnt/vast/hpc/homes/rf2872/codes/xqtl-analysis/resource/hg38_1362_blocks.bed \\\n", " --file_path /home/hs3393/RSS_QC/GWAS_finemapping_Apr9/univariate_rss \\\n", " --name demo_gwas \\\n", @@ -464,7 +456,7 @@ }, "outputs": [], "source": [ - "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb combine_export_meta \\\n", + "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/mnm_postprocessing.ipynb combine_export_meta \\\n", " --cache_path demo_susie_back/demo_susie_cache \\\n", " --output_file test_combine" ] @@ -489,7 +481,7 @@ }, "outputs": [], "source": [ - "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb overlap_qtl_gwas \\\n", + "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/mnm_postprocessing.ipynb overlap_qtl_gwas \\\n", " --name demo_overlap \\\n", " --qtl_meta_path /mnt/vast/hpc/csg/rf2872/Work/Multivariate/susie_2024_new/demo_susie/demo_susie.cis_results_db.tsv \\\n", " --gwas_meta_path /mnt/vast/hpc/csg/rf2872/Work/Multivariate/susie_2024_new/demo_gwas/demo_gwas.block_results_db.tsv \\\n", @@ -516,7 +508,7 @@ }, "outputs": [], "source": [ - "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb export_top_loci \\\n", + "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/mnm_postprocessing.ipynb export_top_loci \\\n", "--export_path /home/ubuntu/demo_singlecontext/ \\\n", "--region ENSG00000197106 \\\n", "--prefix ROSMAP_DeJager \\\n", @@ -543,7 +535,7 @@ "## Containers that contains the necessary packages\n", "parameter: container = \"\"\n", "import re\n", - "parameter: entrypoint= ('micromamba run -a \"\" -n' + ' ' + re.sub(r'(_apptainer:latest|_docker:latest|\\.sif)$', '', container.split('/')[-1])) if container else \"\"\n", + "parameter: entrypoint= \"\"\n", "# For cluster jobs, number commands to run per job\n", "parameter: job_size = 50\n", "# Wall clock time expected\n",