From f294f57cfc8d9150d56ceeb4ce59ad375597e367 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Fri, 12 Sep 2025 20:58:33 -0400
Subject: [PATCH 01/12] improve computational

Using profvis to check the computational time for each part of functions. Improve on several way:
1. check_pair_jkeach: instead of checking every pair, we check the upper tri first to reduce the burden.
2. Still some LD checking issue caused the LD-free mode.
3. change matrix multiplication using tcrossprod function
---
 R/colocboost_check_update_jk.R | 105 ++++++++++++++++++++++++---------
 R/colocboost_inference.R       |  13 ++--
 R/colocboost_init.R            |   2 +-
 R/colocboost_utils.R           |  20 +++++--
 4 files changed, 99 insertions(+), 41 deletions(-)

diff --git a/R/colocboost_check_update_jk.R b/R/colocboost_check_update_jk.R
index 623faa8..05390c8 100644
--- a/R/colocboost_check_update_jk.R
+++ b/R/colocboost_check_update_jk.R
@@ -66,15 +66,15 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) {
 
     # -- define jk and jk_each
     cor_vals_each <- do.call(cbind, lapply(model_update, function(cc) cc$correlation))
-    abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`)
+    # abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`)
+    abs_cor_vals_each <- abs(cor_vals_each) * rep(adj_dep, each = nrow(cor_vals_each))
     abs_cor_vals <- rowSums(abs_cor_vals_each)
 
-    jk <- which(abs_cor_vals == max(abs_cor_vals))
-    jk <- ifelse(length(jk) == 1, jk, sample(jk, 1))
-    jk_each <- apply(abs_cor_vals_each, 2, function(temp) {
-      jk_temp <- which(temp == max(temp))
-      return(ifelse(length(jk_temp) == 1, jk_temp, sample(jk_temp, 1)))
-    })
+    # jk <- which(abs_cor_vals == max(abs_cor_vals))
+    # jk <- ifelse(length(jk) == 1, jk, sample(jk, 1))
+    # jk_each <- apply(abs_cor_vals_each, 2, which.max)
+    jk <- which.max(abs_cor_vals)
+    jk_each <- sapply(1:ncol(abs_cor_vals_each), function(j) which.max(abs_cor_vals_each[, j]))
     update_jk[c(1, pos.update + 1)] <- c(jk, jk_each)
 
     
@@ -142,7 +142,7 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) {
         update_status[pos.update[true_update]] <- 1
         # - re-define jk_star
         abs_cor_vals_redefine <- rowSums(abs_cor_vals_each[, true_update, drop = FALSE])
-        jk_redefine <- which(abs_cor_vals_redefine == max(abs_cor_vals_redefine))
+        jk_redefine <- which.max(abs_cor_vals_redefine == max(abs_cor_vals_redefine))
         jk_redefine <- ifelse(length(jk_redefine) == 1, jk_redefine, sample(jk_redefine, 1))
         real_update_jk[pos.update[true_update]] <- jk_redefine
       } else {
@@ -351,11 +351,10 @@ boost_check_update_jk_focal <- function(cb_model, cb_model_para, cb_data,
 
     # -- define jk and jk_each
     cor_vals_each <- do.call(cbind, lapply(model_update, function(cc) as.vector(cc$correlation)))
-    abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`)
-    jk_each <- apply(abs_cor_vals_each, 2, function(temp) {
-      jk_temp <- which(temp == max(temp))
-      return(ifelse(length(jk_temp) == 1, jk_temp, sample(jk_temp, 1)))
-    })
+    # abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`)
+    abs_cor_vals_each <- abs(cor_vals_each) * rep(adj_dep, each = nrow(cor_vals_each))
+    # jk_each <- apply(abs_cor_vals_each, 2, which.max)
+    jk_each <- sapply(1:ncol(abs_cor_vals_each), function(j) which.max(abs_cor_vals_each[, j]))
     pp_focal <- which(pos.update == focal_outcome_idx)
     jk_focal <- jk_each[pp_focal]
 
@@ -503,8 +502,8 @@ get_LD_jk1_jk2 <- function(jk1, jk2,
     if (length(XtX) == 1){
       LD_temp <- 0
     } else {
-      jk1.remain <- which(remain_jk == jk1)
-      jk2.remain <- which(remain_jk == jk2)
+      jk1.remain <- match(jk1, remain_jk)
+      jk2.remain <- match(jk2, remain_jk)
       LD_temp <- XtX[jk1.remain, jk2.remain]
     }
   }
@@ -539,33 +538,83 @@ check_jk_jkeach <- function(jk, jk_each,
   return(judge)
 }
 
+# check_pair_jkeach <- function(jk_each,
+#                               pos.update,
+#                               model_update,
+#                               cb_data, X_dict,
+#                               jk_equiv_corr = 0.8,
+#                               jk_equiv_loglik = 0.001) {
+#   data_update <- cb_data$data[pos.update]
+#   # -- check if jk_i ~ jk_j
+#   change_each_pair <- matrix(FALSE, nrow = length(jk_each), ncol = length(jk_each))
+#   for (i in 1:length(jk_each)) {
+#     
+#     jk_i <- jk_each[i]
+#     change_log_jk_i <- model_update[[i]]$change_loglike[jk_i]
+#     for (j in 1:length(jk_each)) {
+#       if (j != i) {
+#         jk_j <- jk_each[j]
+# 
+#         if (!(jk_i %in% data_update[[i]]$variable_miss) & !(jk_j %in% data_update[[i]]$variable_miss)) {
+#           change_log_jk_j <- model_update[[i]]$change_loglike[jk_j]
+#           change_each <- abs(change_log_jk_i - change_log_jk_j)
+#           LD_temp <- get_LD_jk1_jk2(jk_i, jk_j,
+#             X = cb_data$data[[X_dict[i]]]$X,
+#             XtX = cb_data$data[[X_dict[i]]]$XtX,
+#             N = data_update[[i]]$N,
+#             remain_jk = setdiff(1:length(model_update[[i]]$res), data_update[[i]]$variable_miss)
+#           )
+#           change_each_pair[i, j] <- (change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr)
+#         } else {
+#           change_each_pair[i, j] <- FALSE
+#         }
+#       } else {
+#         change_each_pair[i, j] <- FALSE
+#       }
+#     }
+#   }
+#   change_each_pair <- change_each_pair + t(change_each_pair)
+#   return(change_each_pair)
+# }
+
 check_pair_jkeach <- function(jk_each,
                               pos.update,
                               model_update,
                               cb_data, X_dict,
                               jk_equiv_corr = 0.8,
                               jk_equiv_loglik = 0.001) {
+  
+  detect_func <- function(idx, jk_i, jk_j){
+    change_log_jk_i <- model_update[[idx]]$change_loglike[jk_i]
+    change_log_jk_j <- model_update[[idx]]$change_loglike[jk_j]
+    change_each <- abs(change_log_jk_i - change_log_jk_j)
+    LD_temp <- get_LD_jk1_jk2(jk_i, jk_j,
+                              X = cb_data$data[[X_dict[idx]]]$X,
+                              XtX = cb_data$data[[X_dict[idx]]]$XtX,
+                              N = data_update[[idx]]$N,
+                              remain_jk = setdiff(1:length(model_update[[idx]]$res), data_update[[idx]]$variable_miss)
+    )
+    return((change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr))
+  }
+  
   data_update <- cb_data$data[pos.update]
   # -- check if jk_i ~ jk_j
   change_each_pair <- matrix(FALSE, nrow = length(jk_each), ncol = length(jk_each))
   for (i in 1:length(jk_each)) {
-    
     jk_i <- jk_each[i]
-    change_log_jk_i <- model_update[[i]]$change_loglike[jk_i]
-    for (j in 1:length(jk_each)) {
+    for (j in i:length(jk_each)) {
       if (j != i) {
         jk_j <- jk_each[j]
-
         if (!(jk_i %in% data_update[[i]]$variable_miss) & !(jk_j %in% data_update[[i]]$variable_miss)) {
-          change_log_jk_j <- model_update[[i]]$change_loglike[jk_j]
-          change_each <- abs(change_log_jk_i - change_log_jk_j)
-          LD_temp <- get_LD_jk1_jk2(jk_i, jk_j,
-            X = cb_data$data[[X_dict[i]]]$X,
-            XtX = cb_data$data[[X_dict[i]]]$XtX,
-            N = data_update[[i]]$N,
-            remain_jk = setdiff(1:length(model_update[[i]]$res), data_update[[i]]$variable_miss)
-          )
-          change_each_pair[i, j] <- (change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr)
+          change_each_pair[i, j] <- detect_func(i, jk_i, jk_j)
+          # if jk_i and jk_j are equivalent on dataset i, then we don't need to check dataset j
+          if ( !change_each_pair[i, j] ){
+            if (!(jk_i %in% data_update[[j]]$variable_miss) & !(jk_j %in% data_update[[j]]$variable_miss)) {
+              change_each_pair[j, i] <- detect_func(j, jk_i, jk_j)
+            } else {
+              change_each_pair[j, i] <- FALSE
+            }
+          }
         } else {
           change_each_pair[i, j] <- FALSE
         }
diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R
index dc53908..e76d708 100644
--- a/R/colocboost_inference.R
+++ b/R/colocboost_inference.R
@@ -132,7 +132,8 @@ get_modularity <- function(Weight, B) {
   
   if (m == 0) return(0)
   
-  cate <- B %*% t(B)
+  # cate <- B %*% t(B)
+  cate <- tcrossprod(B)
   
   if (m_pos == 0 & m_neg == 0) return(0)
   
@@ -190,12 +191,12 @@ get_n_cluster <- function(hc, Sigma, m = ncol(Sigma), min_cluster_corr = 0.8) {
 #' @noRd
 w_purity <- function(weights, X = NULL, Xcorr = NULL, N = NULL, n = 100, coverage = 0.95,
                      min_abs_corr = 0.5, median_abs_corr = NULL, miss_idx = NULL) {
-  tmp <- apply(weights, 2, w_cs, coverage = coverage)
-  tmp_purity <- apply(tmp, 2, function(tt) {
-    pos <- which(tt == 1)
+  
+  tmp_purity <- apply(weights, 2, function(w) {
+    pos <- w_cs(w, coverage = coverage)
     # deal with missing snp here
     if (!is.null(Xcorr)) {
-      pos <- match(pos, setdiff(1:length(tmp), miss_idx))
+      pos <- match(pos, setdiff(1:length(w), miss_idx))
     }
     get_purity(pos, X = X, Xcorr = Xcorr, N = N, n = n)
   })
@@ -420,7 +421,7 @@ get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) {
       corr[which(is.na(corr))] <- 0
       value <- abs(get_upper_tri(corr))
     } else {
-      if (sum(Xcorr) == 1){
+      if (length(Xcorr) == 1){
         value <- 0
       } else {
         Xcorr <- Xcorr # if (!is.null(N)) Xcorr/(N-1) else Xcorr
diff --git a/R/colocboost_init.R b/R/colocboost_init.R
index 8685b01..ea27ce7 100644
--- a/R/colocboost_init.R
+++ b/R/colocboost_init.R
@@ -619,7 +619,7 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic
     Z_extend[pos_target] <- current_z[pos_z]
 
     # Calculate submatrix for each unique entry (not duplicates)
-    if (sum(current_ld_matrix) == 1){
+    if (length(current_ld_matrix) == 1){
       ld_submatrix <- current_ld_matrix
     } else {
       ld_submatrix <- NULL
diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R
index c4da011..0407633 100644
--- a/R/colocboost_utils.R
+++ b/R/colocboost_utils.R
@@ -426,15 +426,20 @@ get_max_profile <- function(cb_obj, check_null_max = 0.025,
 }
 
 
+# ### Function for check cs for each weight
+# w_cs <- function(weights, coverage = 0.95) {
+#   indices <- unlist(get_in_cos(weights, coverage = coverage))
+#   result <- rep(0, length(weights))
+#   result[indices] <- 1
+#   return(result)
+# }
+
 ### Function for check cs for each weight
 w_cs <- function(weights, coverage = 0.95) {
   indices <- unlist(get_in_cos(weights, coverage = coverage))
-  result <- rep(0, length(weights))
-  result[indices] <- 1
-  return(result)
+  return(indices)
 }
 
-
 #' Pure R implementation (fallback)
 #' @noRd
 get_merge_ordered_with_indices <- function(vector_list) {
@@ -505,12 +510,14 @@ get_merge_ordered_with_indices <- function(vector_list) {
   # Step 4: Topological sort using Kahn's algorithm
   # Start with nodes that have no incoming edges
   queue <- all_elements[sapply(all_elements, function(elem) in_degree[[elem]] == 0)]
-  result <- character()
+  result <- list()
+  result_idx <- 1
   while (length(queue) > 0) {
     # Take the first element from the queue
     current <- queue[1]
     queue <- queue[-1]
-    result <- c(result, current)
+    result[[result_idx]] <- current  # List assignment is fast
+    result_idx <- result_idx + 1
     # Process all neighbors (elements that must come after current)
     neighbors <- graph[[current]]
     for (next_elem in neighbors) {
@@ -520,6 +527,7 @@ get_merge_ordered_with_indices <- function(vector_list) {
       }
     }
   }
+  result <- unlist(result)
   # Step 5: Check for cycles and use fallback if needed
   if (length(result) != n_elements) {
     # Different variable orders detected - use priority-based fallback

From 4fd1b757c44604d9e732f5eba673150bd25b5e43 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Fri, 12 Sep 2025 21:36:13 -0400
Subject: [PATCH 02/12] Enhanced Highlight Functionality with Flexible Styling
 Options

The highlighting feature has been updated to provide greater flexibility in visualization options. In previous versions, additional highlight variables in the colocboost plot could only be displayed as vertical lines by setting add_vertical = TRUE.
The updated colocboost_plot function now offers more versatile highlighting capabilities through the new add_highlight = TRUE parameter, which allows users to display highlight variables as either vertical lines or red star markers, providing improved customization for data visualization needs.
---
 R/colocboost_plot.R                           | 148 ++++++++++--------
 tests/testthat/test_plot.R                    |   4 +-
 tests/testthat/test_utils.R                   |   6 +-
 vignettes/Visualization_ColocBoost_Output.Rmd |  11 +-
 4 files changed, 94 insertions(+), 75 deletions(-)

diff --git a/R/colocboost_plot.R b/R/colocboost_plot.R
index 90141d6..1f0f7c1 100644
--- a/R/colocboost_plot.R
+++ b/R/colocboost_plot.R
@@ -14,8 +14,9 @@
 #' @param plot_focal_cos_outcome_only Logical, if TRUE only plots colocalization including at least on colocalized outcome with focal outcome, default is FALSE.
 #' @param points_color Background color for non-colocalized variables, default is "grey80".
 #' @param cos_color Optional custom colors for CoS.
-#' @param add_vertical Logical, if TRUE adds vertical lines at specified positions, default is FALSE
-#' @param add_vertical_idx Optional indices for vertical lines.
+#' @param add_highlight Logical, if TRUE adds vertical lines at specified positions, default is FALSE
+#' @param add_highlight_style Optional style of add_highlight variables, default is "vertical_lines", other choice is "star" - red star.
+#' @param add_highlight_idx Optional indices for add_highlight variables.
 #' @param outcome_names Optional vector of outcomes names for the subtitle of each figure. \code{outcome_names=NULL} for the outcome name shown in \code{data_info}.
 #' @param plot_cols Number of columns in the plot grid, default is 2. If you have many colocalization. please consider increasing this.
 #' @param variant_coord Logical, if TRUE uses variant coordinates on x-axis, default is FALSE. This is required the variable names including position information.
@@ -78,8 +79,9 @@ colocboost_plot <- function(cb_output, y = "log10p",
                             plot_focal_cos_outcome_only = FALSE,
                             points_color = "grey80",
                             cos_color = NULL,
-                            add_vertical = FALSE,
-                            add_vertical_idx = NULL,
+                            add_highlight = FALSE,
+                            add_highlight_idx = NULL,
+                            add_highlight_style = "vertical_lines",
                             outcome_names = NULL,
                             plot_cols = 2,
                             variant_coord = FALSE,
@@ -102,33 +104,34 @@ colocboost_plot <- function(cb_output, y = "log10p",
   if (!inherits(cb_output, "colocboost")) {
     stop("Input of colocboost_plot must be a 'colocboost' object!")
   }
-
+  
   # get cb_plot_input data from colocboost results
   cb_plot_input <- get_input_plot(cb_output,
-    plot_cos_idx = plot_cos_idx,
-    variant_coord = variant_coord,
-    outcome_names = outcome_names,
-    plot_focal_only = plot_focal_only,
-    plot_focal_cos_outcome_only = plot_focal_cos_outcome_only,
-    show_cos_to_uncoloc = show_cos_to_uncoloc,
-    show_cos_to_uncoloc_idx = show_cos_to_uncoloc_idx,
-    show_cos_to_uncoloc_outcome = show_cos_to_uncoloc_outcome,
-    plot_ucos = plot_ucos, plot_ucos_idx = plot_ucos_idx
+                                  plot_cos_idx = plot_cos_idx,
+                                  variant_coord = variant_coord,
+                                  outcome_names = outcome_names,
+                                  plot_focal_only = plot_focal_only,
+                                  plot_focal_cos_outcome_only = plot_focal_cos_outcome_only,
+                                  show_cos_to_uncoloc = show_cos_to_uncoloc,
+                                  show_cos_to_uncoloc_idx = show_cos_to_uncoloc_idx,
+                                  show_cos_to_uncoloc_outcome = show_cos_to_uncoloc_outcome,
+                                  plot_ucos = plot_ucos, plot_ucos_idx = plot_ucos_idx
   )
   # get initial set up of plot
   cb_plot_init <- plot_initial(cb_plot_input,
-    y = y, points_color = points_color, cos_color = cos_color,
-    ylim_each = ylim_each, title_specific = title_specific,
-    outcome_legend_pos = outcome_legend_pos, outcome_legend_size = outcome_legend_size,
-    cos_legend_pos = cos_legend_pos, plot_ucos = plot_ucos,
-    show_variable = show_variable, lab_style = lab_style, axis_style = axis_style,
-    title_style = title_style, ...
+                               y = y, points_color = points_color, cos_color = cos_color,
+                               ylim_each = ylim_each, title_specific = title_specific,
+                               outcome_legend_pos = outcome_legend_pos, outcome_legend_size = outcome_legend_size,
+                               cos_legend_pos = cos_legend_pos, plot_ucos = plot_ucos,
+                               show_variable = show_variable, lab_style = lab_style, axis_style = axis_style,
+                               title_style = title_style, ...
   )
-
+  
   colocboost_plot_basic <- function(cb_plot_input, cb_plot_init,
                                     outcome_idx = NULL, plot_all_outcome = FALSE,
                                     grange = NULL, plot_cols = 2,
-                                    add_vertical = FALSE, add_vertical_idx = NULL,
+                                    add_highlight = FALSE, add_highlight_idx = NULL,
+                                    add_highlight_style = "vertical_lines",
                                     show_top_variables = TRUE,
                                     ...) {
     args <- list(...)
@@ -147,12 +150,12 @@ colocboost_plot <- function(cb_output, y = "log10p",
     args$font.lab <- cb_plot_init$lab_face
     # - change position
     cb_plot_init$outcome_legend_pos <- switch(cb_plot_init$outcome_legend_pos,
-      "right" = 4,
-      "left" = 2,
-      "top" = 3,
-      "bottom" = 1
+                                              "right" = 4,
+                                              "left" = 2,
+                                              "top" = 3,
+                                              "bottom" = 1
     )
-
+    
     # - begin plotting
     coloc_cos <- cb_plot_input$cos
     outcomes <- cb_plot_input$outcomes
@@ -229,12 +232,7 @@ colocboost_plot <- function(cb_output, y = "log10p",
               cex = cb_plot_init$outcome_legend_size, font = 1
         )
       }
-      if (add_vertical) {
-        for (iii in 1:length(add_vertical_idx)) {
-          abline(v = add_vertical_idx[iii], col = "#E31A1C", lwd = 1.5, lty = "dashed")
-        }
-      }
-
+      
       # mark variables in CoS to colocalized outcomes
       if (!is.null(coloc_cos)) {
         n.coloc <- length(coloc_cos)
@@ -242,7 +240,7 @@ colocboost_plot <- function(cb_output, y = "log10p",
         if (length(y)==1){coloc_index = lapply(coloc_index, function(i) 1 )}
         legend_text <- list(col = vector())
         legend_text$col <- head(cb_plot_init$col, n.coloc)
-
+        
         # check which coloc set for this outcome
         p.coloc <- sapply(coloc_index, function(idx) sum(idx == iy) != 0)
         p.coloc <- which(p.coloc)
@@ -261,7 +259,7 @@ colocboost_plot <- function(cb_output, y = "log10p",
             points(x_hits, y_hits, pch = 21, bg = legend_text$col[i.cs], col = "#E31A1C", cex = 3, lwd = 3)
           }
         }
-
+        
         # mark variables in CoS to uncolocalized outcomes
         uncoloc <- cb_plot_input$uncoloc
         if (!is.null(uncoloc)) {
@@ -276,8 +274,8 @@ colocboost_plot <- function(cb_output, y = "log10p",
               x0 <- intersect(args$x, cs)
               y1 <- args$y[match(x0, args$x)]
               points(x0, y1,
-                pch = 4, col = adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 0.4),
-                cex = 1.5, lwd = 1.5
+                     pch = 4, col = adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 0.4),
+                     cex = 1.5, lwd = 1.5
               )
               shape_col <- c(shape_col, adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 1))
               texts_col <- c(texts_col, adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 0.8))
@@ -294,35 +292,51 @@ colocboost_plot <- function(cb_output, y = "log10p",
           if (length(texts) == 0) {
             next
           }
-
+          
           # Get current plot area coordinates
           usr <- par("usr")
           x_pos <- usr[1] + cb_plot_init$cos_legend_pos[1] * (usr[2] - usr[1])  # 5% from left edge
           y_pos <- usr[3] + cb_plot_init$cos_legend_pos[2] * (usr[4] - usr[3])  # 50% from bottom edge
           legend(x = x_pos, y = y_pos, texts,
-            bty = "n", col = shape_col, text.col = texts_col,
-            cex = 1.5, pt.cex = 1.5, pch = 4, x.intersp = cos_legend_pos[3], y.intersp = cos_legend_pos[4]
+                 bty = "n", col = shape_col, text.col = texts_col,
+                 cex = 1.5, pt.cex = 1.5, pch = 4, x.intersp = cos_legend_pos[3], y.intersp = cos_legend_pos[4]
           )
         }
       }
+      if (add_highlight) {
+        for (iii in 1:length(add_highlight_idx)) {
+          if ( !(add_highlight_style %in% c("vertical_lines", "star")) ){
+            message("add_highlight_style is not 'vertical_lines' and 'star', defaulting to 'vertical_lines'")
+            add_highlight_style = "vertical_lines"
+          }
+          if (add_highlight_style == "vertical_lines"){
+            abline(v = add_highlight_idx[iii], col = "#E31A1C", lwd = 1.5, lty = "dashed")
+          } else if (add_highlight_style == "star") {
+            points(add_highlight_idx[iii], args$y[match(add_highlight_idx[iii],args$x)], 
+                   pch = 8, col = "#E31A1C", lwd = 2, lty = "dashed", cex = 2.5)
+          } 
+          
+        }
+      }
     }
     if (!is.null(cb_plot_init$title)) {
       mtext(cb_plot_init$title,
-        side = 3, line = 0, outer = TRUE,
-        cex = cb_plot_init$title_size, font = cb_plot_init$title_face
+            side = 3, line = 0, outer = TRUE,
+            cex = cb_plot_init$title_size, font = cb_plot_init$title_face
       )
     }
     return(invisible())
   }
-
+  
   colocboost_plot_basic(cb_plot_input, cb_plot_init,
-    grange = grange,
-    outcome_idx = outcome_idx, 
-    plot_all_outcome = plot_all_outcome,
-    plot_cols = plot_cols,
-    add_vertical = add_vertical, add_vertical_idx = add_vertical_idx,
-    show_top_variables = show_top_variables,
-    ...
+                        grange = grange,
+                        outcome_idx = outcome_idx, 
+                        plot_all_outcome = plot_all_outcome,
+                        plot_cols = plot_cols,
+                        add_highlight = add_highlight, add_highlight_idx = add_highlight_idx,
+                        add_highlight_style = add_highlight_style,
+                        show_top_variables = show_top_variables,
+                        ...
   )
 }
 
@@ -339,7 +353,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
                            show_cos_to_uncoloc_outcome = NULL,
                            plot_ucos = FALSE,
                            plot_ucos_idx = NULL) {
-
+  
   # check ucos exists
   if (plot_ucos && !"ucos_details" %in% names(cb_output)) {
     warning(
@@ -354,7 +368,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
     cb_output$data_info$outcome_info$outcome_names <- outcome_names
     names(cb_output$data_info$z) <- outcome_names
   }
-
+  
   # extract results from colocboost
   analysis_outcome <- cb_output$data_info$outcome_info$outcome_names
   focal_outcome_idx <- which(cb_output$data_info$outcome_info$is_focal)
@@ -371,7 +385,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
   Z <- cb_output$data_info$z
   coef <- cb_output$data_info$coef
   vcp <- list(as.numeric(cb_output$vcp))
-
+  
   # if finemapping
   if (cb_output$data_info$n_outcomes == 1) {
     cb_output$cos_details$cos$cos_variables <- cb_output$ucos_details$ucos$ucos_variables
@@ -406,7 +420,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
       }
     })
     ncos <- length(cb_output$cos_details$cos$cos_index)
-
+    
     select_cs <- 1:ncos
     if (!is.null(plot_cos_idx)) {
       if (length(setdiff(plot_cos_idx, c(1:ncos))) != 0) {
@@ -477,7 +491,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
     "coloc_index" = coloc_index,
     "select_cos" = select_cs
   )
-
+  
   # check plot uncolocalizaed confidence sets from ucos_details
   if (plot_ucos & cb_output$data_info$n_outcomes > 1){
     ucos_details <- cb_output$ucos_details
@@ -528,7 +542,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
       plot_input$ucos_cos_int_weights <- combined
     }
   }
-
+  
   # check if plot cos to uncolocalized outcome
   # use the updated coloc_cos and related components from plot_input if available
   coloc_cos <- plot_input$cos
@@ -613,7 +627,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
     )
     plot_input$uncoloc <- uncoloc
   }
-
+  
   class(plot_input) <- "colocboost"
   return(plot_input)
 }
@@ -634,11 +648,11 @@ plot_initial <- function(cb_plot_input, y = "log10p",
                          ...) {
   args <- list(...)
   if (!exists("pch", args)) args$pch <- 16
-
+  
   # - set background point color and cos color pools
   args$bg <- points_color
   if (is.null(cos_color)) {
-
+    
     # cos_color <- c(
     #   "#377EB8", "#E69F00", "#33A02C", "#984EA3", "#F46D43",
     #   "#A65628", "#1F4E79", "#B2182B", "#D73027", "#F781BF",
@@ -688,12 +702,12 @@ plot_initial <- function(cb_plot_input, y = "log10p",
       "#D5D8DC", "#BDC3C7", "#95A5A6", "#7F8C8D", "#17202A",
       "#808080", "#A9A9A9", "#C0C0C0", "#696969", "#778899"
     )
-
-
+    
+    
     # cos_color <- c("#1F70A9", "#33A02C", "#CAB2D6", "#EA7827")
   }
   args$col <- cos_color[cb_plot_input$select_cos]
-
+  
   # - set data and x-lab and y-lab
   if (y == "log10p") {
     plot_data <- lapply(cb_plot_input$Zscores, function(z) {
@@ -729,12 +743,12 @@ plot_initial <- function(cb_plot_input, y = "log10p",
   if (!exists("ylab", args)) args$ylab <- ylab
   args$lab_size <- as.numeric(lab_style[1])
   args$lab_face <- lab_style[2]
-
+  
   # - set title format
   args$title <- title_specific
   args$title_size <- as.numeric(title_style[1])
   args$title_face <- title_style[2]
-
+  
   # - set x-axis and y-axis
   args$x <- cb_plot_input$x$pos
   args$y <- plot_data
@@ -744,7 +758,7 @@ plot_initial <- function(cb_plot_input, y = "log10p",
   }
   args$axis_size <- as.numeric(axis_style[1])
   args$axis_face <- axis_style[2]
-
+  
   # - set ylim for each subfigure
   if (exists("ylim", args)) {
     ymax <- rep(args$ylim[2], length(args$y))
@@ -776,7 +790,7 @@ plot_initial <- function(cb_plot_input, y = "log10p",
   }
   args$ymax <- ymax
   args$ymin <- ymin
-
+  
   # - set legend text position and format
   args$outcome_legend_pos <- outcome_legend_pos
   args$outcome_legend_size <- outcome_legend_size
@@ -788,7 +802,7 @@ plot_initial <- function(cb_plot_input, y = "log10p",
     args$outcome_legend_angle <- 0
   }
   args$cos_legend_pos <- cos_legend_pos
-
+  
   return(args)
 }
 
diff --git a/tests/testthat/test_plot.R b/tests/testthat/test_plot.R
index 775d3fe..d49aace 100644
--- a/tests/testthat/test_plot.R
+++ b/tests/testthat/test_plot.R
@@ -176,7 +176,9 @@ test_that("colocboost_plot handles layout options", {
 test_that("colocboost_plot handles additional visualization options", {
   
   # Test with vertical line options
-  expect_error(suppressWarnings(colocboost_plot(cb_res, add_vertical = TRUE, add_vertical_idx = c(5, 10))), NA)
+  expect_error(suppressWarnings(colocboost_plot(cb_res, add_highlight = TRUE, add_highlight_idx = c(5, 10))), NA)
+  expect_error(suppressWarnings(colocboost_plot(cb_res, add_highlight = TRUE, add_highlight_idx = c(5, 10), add_highlight_style = "vertical_lines")), NA)
+  expect_error(suppressWarnings(colocboost_plot(cb_res, add_highlight = TRUE, add_highlight_idx = c(5, 10), add_highlight_style = "star")), NA)
   
   # Test with show_top_variables option
   expect_error(suppressWarnings(colocboost_plot(cb_res, show_top_variables = TRUE)), NA)
diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R
index a622540..d3d480d 100644
--- a/tests/testthat/test_utils.R
+++ b/tests/testthat/test_utils.R
@@ -85,12 +85,12 @@ test_that("w_cs correctly identifies confidence set for weight vector", {
   result <- w_cs(w, coverage = 0.8)
   
   # Expected result for 80% coverage: w[1] + w[2] = 0.8, so first 2 elements should be 1
-  expected <- c(1, 1, 0, 0)
+  expected <- c(1,2)
   expect_equal(result, expected)
   
   # Test with different coverage
   result2 <- w_cs(w, coverage = 0.9)
-  expected2 <- c(1, 1, 1, 1)  # First 4 elements cover 90% since 3 and 4 with the same 0.1 weight 
+  expected2 <- c(1,2,3,4)  # First 4 elements cover 90% since 3 and 4 with the same 0.1 weight 
   expect_equal(result2, expected2)
 })
 
@@ -490,7 +490,7 @@ test_that("get_merge_ordered_with_indices handles conflicting variable orders co
 test_that("get_merge_ordered_with_indices handles edge cases", {
   
   # Test with NULL inputs
-  expect_equal(get_merge_ordered_with_indices(list()), character(0))
+  expect_equal(get_merge_ordered_with_indices(list()), NULL)
   
   # Test with single vector
   expect_equal(get_merge_ordered_with_indices(list(c("A", "B", "C"))), c("A", "B", "C"))
diff --git a/vignettes/Visualization_ColocBoost_Output.Rmd b/vignettes/Visualization_ColocBoost_Output.Rmd
index 1ffd7d9..8ed3e5b 100644
--- a/vignettes/Visualization_ColocBoost_Output.Rmd
+++ b/vignettes/Visualization_ColocBoost_Output.Rmd
@@ -94,9 +94,11 @@ There are three options available for plotting the CoS variants to uncolocalized
 colocboost_plot(res, show_cos_to_uncoloc = TRUE)
 ```
 
-### 2.4. Plot with an added vertical line
+### 2.4. Plot with added highlight points
 
-You can add a vertical line to the plot by setting `add_vertical = TRUE` and `add_vertical_idx = **`. This will add a vertical line at the specified index.
+You can highlight specific variants in the plot by setting `add_highlight = TRUE` and `add_highlight_idx = **`. 
+This will add red dashed vertical lines (defult `add_highlight_style = "vertical_lines"`) at the specified index you want to highlight. 
+Alternatively, you can use `add_highlight_style = "star"` to change the highlight style to the red star for the specified variants.
 For example, to add a vertical line at true causal variants, you can set `add_vertical_idx = unique(unlist(Ind_5traits$true_effect_variants))`.
 Following plot also shows the top variants.
 
@@ -104,8 +106,9 @@ Following plot also shows the top variants.
 ```{r vertical-plot}
 colocboost_plot(
   res, show_top_variables = TRUE, 
-  add_vertical = TRUE, 
-  add_vertical_idx = unique(unlist(Ind_5traits$true_effect_variants))
+  add_highlight = TRUE, 
+  add_highlight_idx = unique(unlist(Ind_5traits$true_effect_variants)),
+  add_highlight_style = "star"
 )
 ```
 

From 220f5f7f9783a0872685df5b4e32162c8c0ef1aa Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Sun, 14 Sep 2025 19:47:17 -0400
Subject: [PATCH 03/12] Optimized LD calculation for Outcome-Specific Model
 Updates

To enhance computational efficiency, we pre-calculate LD values for each pair of outcome-specific best parameter updates at each boost update.
---
 R/colocboost_check_update_jk.R | 100 +++++++++++++--------------------
 R/colocboost_init.R            |   3 +-
 R/colocboost_one_causal.R      |  10 ++--
 R/colocboost_update.R          |   3 +-
 R/colocboost_workhorse.R       |   3 +-
 5 files changed, 49 insertions(+), 70 deletions(-)

diff --git a/R/colocboost_check_update_jk.R b/R/colocboost_check_update_jk.R
index 05390c8..d449b03 100644
--- a/R/colocboost_check_update_jk.R
+++ b/R/colocboost_check_update_jk.R
@@ -538,66 +538,54 @@ check_jk_jkeach <- function(jk, jk_each,
   return(judge)
 }
 
-# check_pair_jkeach <- function(jk_each,
-#                               pos.update,
-#                               model_update,
-#                               cb_data, X_dict,
-#                               jk_equiv_corr = 0.8,
-#                               jk_equiv_loglik = 0.001) {
-#   data_update <- cb_data$data[pos.update]
-#   # -- check if jk_i ~ jk_j
-#   change_each_pair <- matrix(FALSE, nrow = length(jk_each), ncol = length(jk_each))
-#   for (i in 1:length(jk_each)) {
-#     
-#     jk_i <- jk_each[i]
-#     change_log_jk_i <- model_update[[i]]$change_loglike[jk_i]
-#     for (j in 1:length(jk_each)) {
-#       if (j != i) {
-#         jk_j <- jk_each[j]
-# 
-#         if (!(jk_i %in% data_update[[i]]$variable_miss) & !(jk_j %in% data_update[[i]]$variable_miss)) {
-#           change_log_jk_j <- model_update[[i]]$change_loglike[jk_j]
-#           change_each <- abs(change_log_jk_i - change_log_jk_j)
-#           LD_temp <- get_LD_jk1_jk2(jk_i, jk_j,
-#             X = cb_data$data[[X_dict[i]]]$X,
-#             XtX = cb_data$data[[X_dict[i]]]$XtX,
-#             N = data_update[[i]]$N,
-#             remain_jk = setdiff(1:length(model_update[[i]]$res), data_update[[i]]$variable_miss)
-#           )
-#           change_each_pair[i, j] <- (change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr)
-#         } else {
-#           change_each_pair[i, j] <- FALSE
-#         }
-#       } else {
-#         change_each_pair[i, j] <- FALSE
-#       }
-#     }
-#   }
-#   change_each_pair <- change_each_pair + t(change_each_pair)
-#   return(change_each_pair)
-# }
-
 check_pair_jkeach <- function(jk_each,
                               pos.update,
                               model_update,
                               cb_data, X_dict,
                               jk_equiv_corr = 0.8,
                               jk_equiv_loglik = 0.001) {
-  
-  detect_func <- function(idx, jk_i, jk_j){
+
+
+  #' @importFrom stats cor
+  get_LD_jk_each <- function(jk_each,
+                             X = NULL, XtX = NULL, N = NULL,
+                             remain_jk = NULL) {
+    if (!is.null(X)) {
+      LD_temp <- suppressWarnings({
+        get_cormat(X[, jk_each])
+      })
+      LD_temp[which(is.na(LD_temp))] <- 0
+      # LD_temp <- LD_temp[1, 2]
+    } else if (!is.null(XtX)) {
+      if (length(XtX) == 1){
+        LD_temp <- matrix(0, length(jk_each), length(jk_each))
+      } else {
+        jk.remain <- match(jk_each, remain_jk)
+        LD_temp <- XtX[jk.remain, jk.remain]
+        LD_temp[which(is.na(LD_temp))] <- 0
+      }
+    }
+    return(LD_temp)
+  }
+
+  detect_func <- function(idx, LD_all, jk_i, jk_j, i, j){
     change_log_jk_i <- model_update[[idx]]$change_loglike[jk_i]
     change_log_jk_j <- model_update[[idx]]$change_loglike[jk_j]
     change_each <- abs(change_log_jk_i - change_log_jk_j)
-    LD_temp <- get_LD_jk1_jk2(jk_i, jk_j,
-                              X = cb_data$data[[X_dict[idx]]]$X,
-                              XtX = cb_data$data[[X_dict[idx]]]$XtX,
-                              N = data_update[[idx]]$N,
-                              remain_jk = setdiff(1:length(model_update[[idx]]$res), data_update[[idx]]$variable_miss)
-    )
+    LD_temp <- LD_all[[idx]][i, j]
     return((change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr))
   }
-  
+
   data_update <- cb_data$data[pos.update]
+  LD_all <- lapply(1:length(jk_each), function(idx){
+    get_LD_jk_each(jk_each,
+                   X = cb_data$data[[X_dict[idx]]]$X,
+                   XtX = cb_data$data[[X_dict[idx]]]$XtX,
+                   N = data_update[[idx]]$N,
+                   remain_jk = setdiff(1:length(model_update[[idx]]$res), data_update[[idx]]$variable_miss)
+    )
+  })
+
   # -- check if jk_i ~ jk_j
   change_each_pair <- matrix(FALSE, nrow = length(jk_each), ncol = length(jk_each))
   for (i in 1:length(jk_each)) {
@@ -605,18 +593,10 @@ check_pair_jkeach <- function(jk_each,
     for (j in i:length(jk_each)) {
       if (j != i) {
         jk_j <- jk_each[j]
-        if (!(jk_i %in% data_update[[i]]$variable_miss) & !(jk_j %in% data_update[[i]]$variable_miss)) {
-          change_each_pair[i, j] <- detect_func(i, jk_i, jk_j)
-          # if jk_i and jk_j are equivalent on dataset i, then we don't need to check dataset j
-          if ( !change_each_pair[i, j] ){
-            if (!(jk_i %in% data_update[[j]]$variable_miss) & !(jk_j %in% data_update[[j]]$variable_miss)) {
-              change_each_pair[j, i] <- detect_func(j, jk_i, jk_j)
-            } else {
-              change_each_pair[j, i] <- FALSE
-            }
-          }
-        } else {
-          change_each_pair[i, j] <- FALSE
+        change_each_pair[i, j] <- detect_func(idx = i, LD_all, jk_i, jk_j, i, j)
+        # if jk_i and jk_j are equivalent on dataset i, then we don't need to check dataset j
+        if ( !change_each_pair[i, j] ){
+          change_each_pair[j, i] <- detect_func(idx = j, LD_all, jk_i, jk_j, i, j)
         }
       } else {
         change_each_pair[i, j] <- FALSE
diff --git a/R/colocboost_init.R b/R/colocboost_init.R
index ea27ce7..edca937 100644
--- a/R/colocboost_init.R
+++ b/R/colocboost_init.R
@@ -163,7 +163,8 @@ colocboost_init_model <- function(cb_data,
     tmp <- list(
       "res" = NULL,
       "beta" = rep(0, P),
-      "weights_path" = c(),
+      # "weights_path" = c(),
+      "weights_path" = list(),
       "profile_loglike_each" = NULL,
       "obj_path" = 999999,
       "obj_single" = 999999,
diff --git a/R/colocboost_one_causal.R b/R/colocboost_one_causal.R
index 0ed96b5..09692d5 100644
--- a/R/colocboost_one_causal.R
+++ b/R/colocboost_one_causal.R
@@ -79,9 +79,8 @@ colocboost_one_iteration <- function(cb_model, cb_model_para, cb_data) {
   cb_model_para[rm_elements] <- NULL
   for (i in 1:length(cb_model)) {
     cb_model[[i]]$obj_path <- as.numeric(unlist(cb_model[[i]]$obj_path[-1]))
-  }
-  for (i in 1:length(cb_model)) {
     cb_model[[i]]$obj_single <- as.numeric(unlist(cb_model[[i]]$obj_single[-1]))
+    cb_model[[i]]$weights_path <- do.call(rbind, cb_model[[i]]$weights_path)
   }
 
   cb_obj <- list("cb_data" = cb_data, "cb_model" = cb_model, "cb_model_para" = cb_model_para)
@@ -238,7 +237,7 @@ colocboost_diagLD <- function(cb_model, cb_model_para, cb_data) {
         cb_model_para, 
         cb_data
       )
-      weights <- rbind(weights, cb_model_tmp[[iy]]$weights_path)
+      weights <- rbind(weights, unlist(cb_model_tmp[[iy]]$weights_path))
     }
     ###### overlap weights
     overlap_pair <- check_pair_overlap(weights, coverage = 0.95)
@@ -307,11 +306,10 @@ colocboost_diagLD <- function(cb_model, cb_model_para, cb_data) {
   cb_model_para[rm_elements] <- NULL
   for (i in 1:length(cb_model)) {
     cb_model[[i]]$obj_path <- as.numeric(unlist(cb_model[[i]]$obj_path[-1]))
-  }
-  for (i in 1:length(cb_model)) {
     cb_model[[i]]$obj_single <- as.numeric(unlist(cb_model[[i]]$obj_single[-1]))
+    cb_model[[i]]$weights_path <- do.call(rbind, cb_model[[i]]$weights_path)
   }
-
+  
   cb_obj <- list("cb_data" = cb_data, "cb_model" = cb_model, "cb_model_para" = cb_model_para)
   class(cb_obj) <- "colocboost"
   return(cb_obj)
diff --git a/R/colocboost_update.R b/R/colocboost_update.R
index 6100828..734bcb1 100644
--- a/R/colocboost_update.R
+++ b/R/colocboost_update.R
@@ -74,7 +74,8 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) {
     # weights <- adj_dep * ld_feature / scaling_factor * exp_abs_cor / sum(exp_abs_cor)
     weights <- adj_dep * obj_ld * exp_abs_cor / sum(exp_abs_cor)
     weights <- weights / sum(weights)
-    cb_model[[i]]$weights_path <- rbind(cb_model[[i]]$weights_path, as.vector(weights))
+    # cb_model[[i]]$weights_path <- rbind(cb_model[[i]]$weights_path, as.vector(weights))
+    cb_model[[i]]$weights_path <- c(cb_model[[i]]$weights_path, list(as.vector(weights)))
     ########## END: MAIN CALCULATION ###################
 
 
diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R
index 7583eee..5bfe645 100644
--- a/R/colocboost_workhorse.R
+++ b/R/colocboost_workhorse.R
@@ -236,9 +236,8 @@ colocboost_workhorse <- function(cb_data,
     cb_model_para$num_updates <- m
     for (i in 1:length(cb_model)) {
       cb_model[[i]]$obj_path <- as.numeric(unlist(cb_model[[i]]$obj_path[-1]))
-    }
-    for (i in 1:length(cb_model)) {
       cb_model[[i]]$obj_single <- as.numeric(unlist(cb_model[[i]]$obj_single[-1]))
+      cb_model[[i]]$weights_path <- do.call(rbind, cb_model[[i]]$weights_path)
     }
 
     if (m == M) {

From 040b1783fefbf6110f00788b65b13c20fdc13e41 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Tue, 30 Sep 2025 15:04:05 -0400
Subject: [PATCH 04/12] Fix purity issue for merged CoS

For the merged CoS, we recalibrate the CoS variants based on the merged weight. We should recalculate purity for the merged CoS and filter unpured merged CoS.
---
 R/colocboost_assemble.R |  3 +++
 R/colocboost_utils.R    | 52 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R
index aba5405..e849872 100644
--- a/R/colocboost_assemble.R
+++ b/R/colocboost_assemble.R
@@ -209,6 +209,9 @@ colocboost_assemble <- function(cb_obj,
     # - colocalization results
     cb_obj$cb_model_para$weight_fudge_factor <- weight_fudge_factor
     cb_obj$cb_model_para$coverage <- coverage
+    cb_obj$cb_model_para$min_abs_corr <- min_abs_corr
+    cb_obj$cb_model_para$median_abs_corr <- median_abs_corr
+    cb_obj$cb_model_para$n_purity <- n_purity
     cos_results <- get_cos_details(cb_obj, coloc_out = past_out$cos$cos, data_info = data_info)
     cb_output <- list(
       "vcp" = cos_results$vcp,
diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R
index 0407633..1a13eeb 100644
--- a/R/colocboost_utils.R
+++ b/R/colocboost_utils.R
@@ -665,8 +665,6 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) {
     })
     int_weight <- lapply(cos_weights, get_integrated_weight, weight_fudge_factor = cb_obj$cb_model_para$weight_fudge_factor)
     names(int_weight) <- names(cos_weights) <- colocset_names
-    vcp <- as.vector(1 - apply(1 - do.call(cbind, int_weight), 1, prod))
-    names(vcp) <- data_info$variables
 
     # - resummary results
     cos_re_idx <- lapply(int_weight, function(w) {
@@ -676,7 +674,55 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) {
       data_info$variables[idx]
     })
     coloc_csets <- list("cos_index" = cos_re_idx, "cos_variables" = cos_re_var)
-
+    
+    # - recalculate purity
+    purity <- vector(mode = "list", length = length(cos_re_idx))
+    for (ee in 1:length(cos_re_idx)) {
+      coloc_t <- coloc_outcome_index[[ee]]
+      p_tmp <- c()
+      for (i3 in coloc_t) {
+        pos <- cos_re_idx[[ee]]
+        X_dict <- cb_obj$cb_data$dict[i3]
+        if (!is.null(cb_obj$cb_data$data[[X_dict]]$XtX)) {
+          pos <- match(pos, setdiff(1:cb_obj$cb_model_para$P, cb_obj$cb_data$data[[i3]]$variable_miss))
+        }
+        tmp <- matrix(get_purity(pos,
+                                 X = cb_obj$cb_data$data[[X_dict]]$X,
+                                 Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX,
+                                 N = cb_obj$cb_data$data[[i3]]$N, n = cb_obj$cb_model_para$n_purity
+        ), 1, 3)
+        p_tmp <- rbind(p_tmp, tmp)
+      }
+      purity[[ee]] <- matrix(apply(p_tmp, 2, max), 1, 3)
+    }
+    purity_all <- do.call(rbind, purity)
+    purity_all <- as.data.frame(purity_all)
+    colnames(purity_all) <- c("min_abs_corr", "mean_abs_corr", "median_abs_corr")
+    coloc_out$purity <- purity_all
+    if (is.null(cb_obj$cb_model_para$median_abs_corr)) {
+      is_pure <- which(purity_all[, 1] >= cb_obj$cb_model_para$min_abs_corr)
+    } else {
+      is_pure <- which(purity_all[, 1] >= cb_obj$cb_model_para$min_abs_corr | purity_all[, 3] >= cb_obj$cb_model_para$median_abs_corr)
+    }
+    if (length(is_pure)==0){
+      coloc_results <- NULL
+      vcp <- NULL
+      return(list("cos_results" = coloc_results, "vcp" = vcp))
+    } else if (length(is_pure)!=length(cos_re_idx)){
+      int_weight = int_weight[is_pure]
+      coloc_csets <- lapply(coloc_csets, function(cs) cs[is_pure])
+      coloc_outcomes <- lapply(coloc_outcomes, function(cs) cs[is_pure])
+      normalization_evidence <- normalization_evidence[is_pure]
+      npc <- npc[is_pure]
+      cos_min_npc_outcome <- cos_min_npc_outcome[is_pure]
+      cos_weights <- cos_weights[is_pure]
+      coloc_out$purity <- purity_all[is_pure,,drop = FALSE]
+      colocset_names <- colocset_names[is_pure]
+    }
+    vcp <- as.vector(1 - apply(1 - do.call(cbind, int_weight), 1, prod))
+    names(vcp) <- data_info$variables
+    
+    
     # - hits variables in each csets
     coloc_hits <- coloc_hits_variablenames <- coloc_hits_names <- c()
     for (i in 1:length(int_weight)) {

From 5c19c59e237191c9f4452749970e330a6cf24413 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Thu, 2 Oct 2025 02:13:57 -0400
Subject: [PATCH 05/12] minor fix

fix error when focal outcome does not pass multiple testing correction at the first place.
---
 R/colocboost_assemble.R  | 2 +-
 R/colocboost_utils.R     | 2 +-
 R/colocboost_workhorse.R | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R
index e849872..4ddd67d 100644
--- a/R/colocboost_assemble.R
+++ b/R/colocboost_assemble.R
@@ -58,7 +58,7 @@ colocboost_assemble <- function(cb_obj,
     )
     # - save model and all coloc and single information for diagnostic
     if (output_level != 1) {
-      tmp <- get_full_output(cb_obj = cb_obj, past_out = NULL, variables = NULL)
+      tmp <- get_full_output(cb_obj = cb_obj, past_out = NULL, variables = data_info$variables)
       if (output_level == 2) {
         cb_output <- c(cb_output, list("ucos_details" = tmp$ucos_detials))
         cb_output <- cb_output[c("cos_summary", "vcp", "cos_details", "data_info", "model_info", "ucos_details")]
diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R
index 1a13eeb..0d57e32 100644
--- a/R/colocboost_utils.R
+++ b/R/colocboost_utils.R
@@ -893,7 +893,7 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output
   cb_model_para <- cb_obj$cb_model_para
 
   ## - obtain the order of variables based on the variables names if it has position information
-  if (!is.null(variables)) {
+  if (is.null(variables)) {
     ordered <- 1:length(cb_obj$cb_model_para$variables)
   } else {
     ordered <- match(variables, cb_obj$cb_model_para$variables)
diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R
index 5bfe645..4462680 100644
--- a/R/colocboost_workhorse.R
+++ b/R/colocboost_workhorse.R
@@ -103,6 +103,9 @@ colocboost_workhorse <- function(cb_data,
   }
 
   if (sum(cb_model_para$update_y == 1) == 0) {
+    for (i in 1:length(cb_model)){
+      cb_model[[i]]$weights_path <- matrix(0, ncol = cb_model_para$P)
+    }
     cb_obj <- list("cb_data" = cb_data, "cb_model" = cb_model, "cb_model_para" = cb_model_para)
     class(cb_obj) <- "colocboost"
     return(cb_obj)

From 8c0dad2856c15f6224d6a5347f259d9e635177bb Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Wed, 22 Oct 2025 13:40:06 -0400
Subject: [PATCH 06/12] Update colocboost_utils.R

fix bug for choosing top variable in ucos details
---
 R/colocboost_utils.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R
index 0d57e32..9bfa3fd 100644
--- a/R/colocboost_utils.R
+++ b/R/colocboost_utils.R
@@ -987,7 +987,7 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output
         # - hits variables in each csets
         cs_hits <- sapply(1:length(specific_w), function(jj) {
           inw <- specific_w[[jj]]
-          sample(which(inw == max(inw)), 1)
+          which(inw == max(inw))[1]
         })
         cs_hits_variablenames <- sapply(cs_hits, function(ch) variables[ch])
         specific_cs_hits <- data.frame("top_index" = cs_hits, "top_variables" = cs_hits_variablenames) # save

From 37b33ad3ccfe67a259074a0649457d07060029fb Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Wed, 22 Oct 2025 14:05:32 -0400
Subject: [PATCH 07/12] fineboost one iteration issue

fix fine-mapping one iteration issue
---
 R/colocboost_assemble.R  | 1 +
 R/colocboost_inference.R | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R
index 4ddd67d..94cbbe5 100644
--- a/R/colocboost_assemble.R
+++ b/R/colocboost_assemble.R
@@ -79,6 +79,7 @@ colocboost_assemble <- function(cb_obj,
     } else if (cb_obj$cb_model_para$model_used == "one_causal"){
       # fixme later
       check_null_max <- check_null_max * check_null
+      check_null_max_ucos <- check_null_max_ucos * check_null
     }
     cb_obj <- get_max_profile(cb_obj, check_null_max = check_null_max, 
                               check_null_max_ucos = check_null_max_ucos, 
diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R
index e76d708..882fc7a 100644
--- a/R/colocboost_inference.R
+++ b/R/colocboost_inference.R
@@ -374,7 +374,12 @@ check_null_post <- function(cb_obj,
   }
   if (!weaker_effect) {
     check_cs_change <- cs_change
-    check_null_tmp <- sapply(1:cb_obj$cb_model_para$L, function(j) cb_obj$cb_model[[j]]$check_null_max)
+    if (cb_obj$cb_model_para$L == 1){
+      check_null_max_tmp <- cb_obj$cb_model[[j]]$check_null_max_ucos
+    } else {
+      check_null_max_tmp <- cb_obj$cb_model[[j]]$check_null_max
+    }
+    check_null_tmp <- sapply(1:cb_obj$cb_model_para$L, function(j) check_null_max_tmp)
   } else {
     check_null_tmp <- rep(check_null, cb_obj$cb_model_para$L)
   }

From c0878aa26d7fe2e8b38f1d94b890c7de386d3823 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Sat, 1 Nov 2025 08:18:40 -0400
Subject: [PATCH 08/12] Update ColocBoost_Wrapper_Pipeline.Rmd

Comment out to avoid running this code here, as we do not have real data files in examples
---
 vignettes/ColocBoost_Wrapper_Pipeline.Rmd | 173 ++++++++++++----------
 1 file changed, 92 insertions(+), 81 deletions(-)

diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
index 6b5a5a4..49bc4c1 100644
--- a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
+++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
@@ -85,23 +85,25 @@ imiss_cutoff = 0.9
 
 # More advanced parameters see pecotmr::load_multitask_regional_data()
 
-region_data_individual <- load_multitask_regional_data(
-    region = region,
-    genotype_list = genotype_list,
-    phenotype_list = phenotype_list,
-    covariate_list = covariate_list,
-    conditions_list_individual = conditions_list_individual,
-    match_geno_pheno = match_geno_pheno,
-    association_window = association_window,
-    region_name_col = region_name_col,
-    extract_region_name = extract_region_name,
-    keep_indel = keep_indel,
-    keep_samples = keep_samples,
-    maf_cutoff = maf_cutoff,
-    mac_cutoff = mac_cutoff,
-    xvar_cutoff = xvar_cutoff,
-    imiss_cutoff = imiss_cutoff
-)
+#### Comment out to avoid running this code here, as we do not have real data files in this example ####
+# region_data_individual <- load_multitask_regional_data(
+#     region = region,
+#     genotype_list = genotype_list,
+#     phenotype_list = phenotype_list,
+#     covariate_list = covariate_list,
+#     conditions_list_individual = conditions_list_individual,
+#     match_geno_pheno = match_geno_pheno,
+#     association_window = association_window,
+#     region_name_col = region_name_col,
+#     extract_region_name = extract_region_name,
+#     keep_indel = keep_indel,
+#     keep_samples = keep_samples,
+#     maf_cutoff = maf_cutoff,
+#     mac_cutoff = mac_cutoff,
+#     xvar_cutoff = xvar_cutoff,
+#     imiss_cutoff = imiss_cutoff
+# )
+#### End of comment out
 
 ```
 
@@ -144,17 +146,19 @@ n_controls = c(0, 40000)
 
 # More advanced parameters see pecotmr::load_multitask_regional_data()
 
-region_data_sumstat <- load_multitask_regional_data(
-    sumstat_path_list = sumstat_path_list,
-    column_file_path_list = column_file_path_list,
-    LD_meta_file_path_list = LD_meta_file_path_list,
-    conditions_list_sumstat = conditions_list_sumstat,
-    match_LD_sumstat = match_LD_sumstat,
-    association_window = association_window,
-    n_samples = n_samples,
-    n_cases = n_cases,
-    n_controls = n_controls
-)
+#### Comment out to avoid running this code here, as we do not have real data files in this example ####
+# region_data_sumstat <- load_multitask_regional_data(
+#     sumstat_path_list = sumstat_path_list,
+#     column_file_path_list = column_file_path_list,
+#     LD_meta_file_path_list = LD_meta_file_path_list,
+#     conditions_list_sumstat = conditions_list_sumstat,
+#     match_LD_sumstat = match_LD_sumstat,
+#     association_window = association_window,
+#     n_samples = n_samples,
+#     n_cases = n_cases,
+#     n_controls = n_controls
+# )
+#### End of comment out
 ```
 
 
@@ -223,58 +227,65 @@ outputs:
 - **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_analysis_pipeline` function. If the mode is not run, the corresponding element will be `NULL`.
 
 ```{r, colocboost-analysis}
-# load in individual-level and sumstat data
-region_data_combined <- load_multitask_regional_data(
-    region = region,
-    genotype_list = genotype_list,
-    phenotype_list = phenotype_list,
-    covariate_list = covariate_list,
-    conditions_list_individual = conditions_list_individual,
-    match_geno_pheno = match_geno_pheno,
-    association_window = association_window,
-    region_name_col = region_name_col,
-    extract_region_name = extract_region_name,
-    keep_indel = keep_indel,
-    keep_samples = keep_samples,
-    maf_cutoff = maf_cutoff,
-    mac_cutoff = mac_cutoff,
-    xvar_cutoff = xvar_cutoff,
-    imiss_cutoff = imiss_cutoff,
-    sumstat_path_list = sumstat_path_list,
-    column_file_path_list = column_file_path_list,
-    LD_meta_file_path_list = LD_meta_file_path_list,
-    conditions_list_sumstat = conditions_list_sumstat,
-    match_LD_sumstat = match_LD_sumstat,
-    n_samples = n_samples,
-    n_cases = n_cases,
-    n_controls = n_controls
-)
-
-maf_cutoff = 0.01
-pip_cutoff_to_skip_ind = rep(0, length(phenotype_list))
-pip_cutoff_to_skip_sumstat = rep(0, length(sumstat_path_list))
-qc_method = "rss_qc" 
-
-# run colocboost analysis
-colocboost_results <- colocboost_analysis_pipeline(
-    region_data_combined,
-    maf_cutoff = maf_cutoff, 
-    pip_cutoff_to_skip_ind = pip_cutoff_to_skip_ind,
-    pip_cutoff_to_skip_sumstat = pip_cutoff_to_skip_sumstat,
-    qc_method = qc_method,
-    xqtl_coloc = TRUE,
-    joint_gwas = TRUE,
-    separate_gwas = TRUE
-)
-
-# visualize results for xQTL-only mode
-colocboost_plot(colocboost_results$xqtl_coloc)
-
-# visualize results for joint GWAS mode
-colocboost_plot(colocboost_results$joint_gwas)
 
-# visualize results for separate GWAS mode
-for (i in 1:length(colocboost_results$separate_gwas)) {
-    colocboost_plot(colocboost_results$separate_gwas[[i]])
-}
+#### Comment out to avoid running this code here, as we do not have real data files in this example ####
+#### Please check the example code below ####
+
+# # load in individual-level and sumstat data
+# region_data_combined <- load_multitask_regional_data(
+#     region = region,
+#     genotype_list = genotype_list,
+#     phenotype_list = phenotype_list,
+#     covariate_list = covariate_list,
+#     conditions_list_individual = conditions_list_individual,
+#     match_geno_pheno = match_geno_pheno,
+#     association_window = association_window,
+#     region_name_col = region_name_col,
+#     extract_region_name = extract_region_name,
+#     keep_indel = keep_indel,
+#     keep_samples = keep_samples,
+#     maf_cutoff = maf_cutoff,
+#     mac_cutoff = mac_cutoff,
+#     xvar_cutoff = xvar_cutoff,
+#     imiss_cutoff = imiss_cutoff,
+#     sumstat_path_list = sumstat_path_list,
+#     column_file_path_list = column_file_path_list,
+#     LD_meta_file_path_list = LD_meta_file_path_list,
+#     conditions_list_sumstat = conditions_list_sumstat,
+#     match_LD_sumstat = match_LD_sumstat,
+#     n_samples = n_samples,
+#     n_cases = n_cases,
+#     n_controls = n_controls
+# )
+
+# maf_cutoff = 0.01
+# pip_cutoff_to_skip_ind = rep(0, length(phenotype_list))
+# pip_cutoff_to_skip_sumstat = rep(0, length(sumstat_path_list))
+# qc_method = "rss_qc" 
+
+# # run colocboost analysis
+# colocboost_results <- colocboost_analysis_pipeline(
+#     region_data_combined,
+#     maf_cutoff = maf_cutoff, 
+#     pip_cutoff_to_skip_ind = pip_cutoff_to_skip_ind,
+#     pip_cutoff_to_skip_sumstat = pip_cutoff_to_skip_sumstat,
+#     qc_method = qc_method,
+#     xqtl_coloc = TRUE,
+#     joint_gwas = TRUE,
+#     separate_gwas = TRUE
+# )
+
+# # visualize results for xQTL-only mode
+# colocboost_plot(colocboost_results$xqtl_coloc)
+
+# # visualize results for joint GWAS mode
+# colocboost_plot(colocboost_results$joint_gwas)
+
+# # visualize results for separate GWAS mode
+# for (i in 1:length(colocboost_results$separate_gwas)) {
+#     colocboost_plot(colocboost_results$separate_gwas[[i]])
+# }
+
+
+#### End of comment out
 ```
\ No newline at end of file

From b6c16f0f6995c912eb48bbf256da025ff67f3342 Mon Sep 17 00:00:00 2001
From: xueweic <xueweic@users.noreply.github.com>
Date: Sat, 1 Nov 2025 12:47:07 +0000
Subject: [PATCH 09/12] Update documentation

---
 man/colocboost_plot.Rd | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/man/colocboost_plot.Rd b/man/colocboost_plot.Rd
index fe155fc..8e5c7df 100644
--- a/man/colocboost_plot.Rd
+++ b/man/colocboost_plot.Rd
@@ -18,8 +18,9 @@ colocboost_plot(
   plot_focal_cos_outcome_only = FALSE,
   points_color = "grey80",
   cos_color = NULL,
-  add_vertical = FALSE,
-  add_vertical_idx = NULL,
+  add_highlight = FALSE,
+  add_highlight_idx = NULL,
+  add_highlight_style = "vertical_lines",
   outcome_names = NULL,
   plot_cols = 2,
   variant_coord = FALSE,
@@ -62,9 +63,11 @@ colocboost_plot(
 
 \item{cos_color}{Optional custom colors for CoS.}
 
-\item{add_vertical}{Logical, if TRUE adds vertical lines at specified positions, default is FALSE}
+\item{add_highlight}{Logical, if TRUE adds vertical lines at specified positions, default is FALSE}
 
-\item{add_vertical_idx}{Optional indices for vertical lines.}
+\item{add_highlight_idx}{Optional indices for add_highlight variables.}
+
+\item{add_highlight_style}{Optional style of add_highlight variables, default is "vertical_lines", other choice is "star" - red star.}
 
 \item{outcome_names}{Optional vector of outcomes names for the subtitle of each figure. \code{outcome_names=NULL} for the outcome name shown in \code{data_info}.}
 

From 7747844d130c7132d189fabbd25186b98211ff59 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Sat, 1 Nov 2025 15:40:46 -0400
Subject: [PATCH 10/12] Update colocboost.R

handle duplication variables - remove one of them
---
 R/colocboost.R | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/R/colocboost.R b/R/colocboost.R
index 81e8490..6cb1721 100644
--- a/R/colocboost.R
+++ b/R/colocboost.R
@@ -247,7 +247,17 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         return(xx)
       })
     }
-
+    
+    # Remove duplicates and report: if duplicate columns of X
+    X <- lapply(seq_along(X), function(i) {
+      df <- X[[i]]
+      if (anyDuplicated(colnames(df))) {
+        message(paste("Removed duplicate columns from X matrix ", i))
+        df[, !duplicated(colnames(df)), drop = FALSE]
+      } else {
+        df
+      }
+    })
     keep_variable_individual <- lapply(X, colnames)
     if (!is.list(X) & !is.list(Y)) {
       warning("Error: Input X and Y must be the list containing genotype matrics and all phenotype vectors!")
@@ -333,11 +343,19 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         warning("Error: effect_est and effect_se should be the same dimension! Please check!")
         return(NULL)
       }
-      variables <- rownames(effect_est)
       effect_est <- as.matrix(effect_est)
       effect_se <- as.matrix(effect_se)
+      variables <- rownames(effect_est)
       if (is.null(variables)) {
         variables <- paste0("variant_", 1:nrow(effect_est))
+      } else {
+        # add - if duplciates occurs, only keep one
+        if (anyDuplicated(variables)) {
+          keep_idx <- !duplicated(variables)
+          variables <- variables[keep_idx]
+          effect_est <- effect_est[keep_idx, , drop = FALSE]
+          effect_se <- effect_se[keep_idx, , drop = FALSE]
+        }
       }
       sumstat <- list()
       for (sum_iy in 1:ncol(effect_est)) {
@@ -388,6 +406,17 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         })
       }
     }
+    
+    # Remove duplicates and report: if duplicate variant in summary statistics
+    sumstat <- lapply(seq_along(sumstat), function(i) {
+      xx <- sumstat[[i]]
+      if (anyDuplicated(xx$variant)) {
+        message(paste("Removed duplicate variants from sumstat", i))
+        xx[!duplicated(xx$variant), , drop = FALSE]
+      } else {
+        xx
+      }
+    })
     keep_variable_sumstat <- lapply(sumstat, function(xx) {
       xx$variant
     })

From fab36510c5171abbdc5943968c7ab6a545dc4828 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Thu, 6 Nov 2025 14:19:03 -0500
Subject: [PATCH 11/12] Several Changes

1. Move colocboost wrapper to pecotmr.
2. Improve colocboost function - validation of the dataset.
3. Fix one error about the missing value or duplicated value in summary statistics
---
 R/colocboost.R                            | 420 +++++++++++++++-------
 man/colocboost_validate_input_data.Rd     |  70 ++++
 tests/testthat/test_sumstats.R            |  77 +++-
 vignettes/ColocBoost_Wrapper_Pipeline.Rmd | 251 +------------
 4 files changed, 430 insertions(+), 388 deletions(-)
 create mode 100644 man/colocboost_validate_input_data.Rd

diff --git a/R/colocboost.R b/R/colocboost.R
index 6cb1721..aba9cdf 100644
--- a/R/colocboost.R
+++ b/R/colocboost.R
@@ -195,6 +195,181 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
     return(NULL)
   }
 
+  # - check input data: individual level data and summary-level data
+  validated_data <- colocboost_validate_input_data(
+    X = X, Y = Y,
+    sumstat = sumstat, LD = LD,
+    dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD,
+    effect_est = effect_est, effect_se = effect_se, effect_n = effect_n,
+    overlap_variables = overlap_variables,
+    M = M, min_abs_corr = min_abs_corr
+  )
+  if (is.null(validated_data)) {
+    return(NULL)
+  }
+  # Extract validated data
+  X <- validated_data$X
+  Y <- validated_data$Y
+  yx_dict <- validated_data$yx_dict
+  keep_variable_individual <- validated_data$keep_variable_individual
+  sumstat <- validated_data$sumstat
+  LD <- validated_data$LD
+  sumstatLD_dict <- validated_data$sumstatLD_dict
+  keep_variable_sumstat <- validated_data$keep_variable_sumstat
+  Z <- validated_data$Z
+  N_sumstat <- validated_data$N_sumstat
+  Var_y <- validated_data$Var_y
+  SeBhat <- validated_data$SeBhat
+  
+  # Update parameters if LD was not provided
+  M <- validated_data$M
+  min_abs_corr <- validated_data$min_abs_corr
+  jk_equiv_corr <- validated_data$jk_equiv_corr
+  jk_equiv_loglik <- validated_data$jk_equiv_loglik
+  func_simplex <- validated_data$func_simplex
+  
+  # - initial colocboost object
+  keep_variables <- c(keep_variable_individual, keep_variable_sumstat)
+  overlapped_variables <- Reduce("intersect", keep_variables)
+  mean_variables <- mean(sapply(keep_variables, length))
+  min_variables <- min(sapply(keep_variables, length))
+  if (min_variables < 100) {
+    warning(
+      "Warning message about the number of variables.\n",
+      "The smallest number of variables across outcomes is ", min_variables, " < 100.",
+      " If this is what you expected, this is not a problem.",
+      " If this is not what you expected, please check input data."
+    )
+  }
+  if (length(overlapped_variables) <= 1) {
+    warning(
+      "Error: No or only 1 overlapping variables were found across all outcomes, colocalization cannot be performed. ",
+      "Please verify the variable names across different outcomes."
+    )
+    return(NULL)
+  } else if ((length(overlapped_variables) / mean_variables) < 0.1) {
+    warning(
+      "Warning message about the overlapped variables.\n",
+      "The average number of variables across outcomes is ", mean_variables,
+      ". But only ", length(overlapped_variables), " number of variables overlapped (<10%).\n",
+      " If this is what you expected, this is not a problem.",
+      " If this is not what you expected, please check if the variable name matched across outcomes."
+    )
+  }
+  cb_data <- colocboost_init_data(
+    X = X, Y = Y, dict_YX = yx_dict,
+    Z = Z, LD = LD, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict,
+    Var_y = Var_y, SeBhat = SeBhat,
+    keep_variables = keep_variables,
+    focal_outcome_idx = focal_outcome_idx,
+    focal_outcome_variables = focal_outcome_variables,
+    overlap_variables = overlap_variables,
+    intercept = intercept,
+    standardize = standardize,
+    residual_correlation = residual_correlation
+  )
+
+  ##################  colocboost updates   ###################################
+  message("Starting gradient boosting algorithm.")
+  cb_obj <- colocboost_workhorse(cb_data,
+    M = M,
+    prioritize_jkstar = prioritize_jkstar,
+    tau = tau,
+    learning_rate_init = learning_rate_init,
+    learning_rate_decay = learning_rate_decay,
+    func_simplex = func_simplex,
+    lambda = lambda,
+    lambda_focal_outcome = lambda_focal_outcome,
+    stop_thresh = stop_thresh,
+    func_multi_test = func_multi_test,
+    stop_null = stop_null,
+    multi_test_max = multi_test_max,
+    multi_test_thresh = multi_test_thresh,
+    ash_prior = ash_prior,
+    p.adjust.methods = p.adjust.methods,
+    jk_equiv_corr = jk_equiv_corr,
+    jk_equiv_loglik = jk_equiv_loglik,
+    func_compare = func_compare,
+    coloc_thresh = coloc_thresh,
+    LD_free = LD_free,
+    dynamic_learning_rate = dynamic_learning_rate,
+    focal_outcome_idx = focal_outcome_idx,
+    outcome_names = outcome_names
+  )
+
+  # --- post-processing of the colocboost updates
+  message("Performing inference on colocalization events.")
+  cb_output <- colocboost_assemble(cb_obj,
+    coverage = coverage,
+    weight_fudge_factor = weight_fudge_factor,
+    check_null = check_null,
+    check_null_method = check_null_method,
+    check_null_max = check_null_max,
+    check_null_max_ucos = check_null_max_ucos,
+    dedup = dedup,
+    overlap = overlap,
+    n_purity = n_purity,
+    min_abs_corr = min_abs_corr,
+    sec_coverage_thresh = sec_coverage_thresh,
+    median_abs_corr = median_abs_corr,
+    min_cluster_corr = min_cluster_corr,
+    median_cos_abs_corr = median_cos_abs_corr,
+    weaker_effect = weaker_effect,
+    merge_cos = merge_cos,
+    tol = tol,
+    output_level = output_level
+  )
+  class(cb_output) <- "colocboost"
+  return(cb_output)
+}
+
+
+
+
+#' @title Validate and Process All Input Data for ColocBoost
+#'
+#' @description Internal function to validate and process both individual-level and summary-level input data
+#'
+#' @param X A list of genotype matrices for different outcomes, or a single matrix if all outcomes share the same genotypes.
+#' @param Y A list of vectors of outcomes or an N by L matrix if it is considered for the same X and multiple outcomes.
+#' @param sumstat A list of data.frames of summary statistics.
+#' @param LD A list of correlation matrices indicating the LD matrix for each genotype.
+#' @param dict_YX A L by 2 matrix of dictionary for X and Y if there exist subsets of outcomes corresponding to the same X matrix.
+#' @param dict_sumstatLD A L by 2 matrix of dictionary for sumstat and LD if there exist subsets of outcomes corresponding to the same sumstat.
+#' @param effect_est Matrix of variable regression coefficients (i.e. regression beta values) in the genomic region
+#' @param effect_se Matrix of standard errors associated with the beta values
+#' @param effect_n A scalar or a vector of sample sizes for estimating regression coefficients.
+#' @param overlap_variables If overlap_variables = TRUE, only perform colocalization in the overlapped region.
+#' @param M The maximum number of gradient boosting rounds for each outcome (default is 500).
+#' @param min_abs_corr Minimum absolute correlation allowed in a confidence set.
+#'
+#' @return A list containing:
+#'   \item{X}{Processed list of genotype matrices}
+#'   \item{Y}{Processed list of phenotype vectors}
+#'   \item{yx_dict}{Dictionary mapping Y to X}
+#'   \item{keep_variable_individual}{List of variable names for each X matrix}
+#'   \item{sumstat}{Processed list of summary statistics data.frames}
+#'   \item{LD}{Processed list of LD matrices}
+#'   \item{sumstatLD_dict}{Dictionary mapping sumstat to LD}
+#'   \item{keep_variable_sumstat}{List of variant names for each sumstat}
+#'   \item{Z}{List of z-scores for each outcome}
+#'   \item{N_sumstat}{List of sample sizes for each outcome}
+#'   \item{Var_y}{List of phenotype variances for each outcome}
+#'   \item{SeBhat}{List of standard errors for each outcome}
+#'   \item{M_updated}{Updated M value (may be changed if LD not provided)}
+#'   \item{min_abs_corr_updated}{Updated min_abs_corr value (may be changed if LD not provided)}
+#'   \item{jk_equiv_corr_updated}{Updated jk_equiv_corr value}
+#'   \item{jk_equiv_loglik_updated}{Updated jk_equiv_loglik value}
+#'   \item{func_simplex_updated}{Updated func_simplex value}
+#'
+#' @keywords internal
+colocboost_validate_input_data <- function(X = NULL, Y = NULL,
+                                           sumstat = NULL, LD = NULL,
+                                           dict_YX = NULL, dict_sumstatLD = NULL,
+                                           effect_est = NULL, effect_se = NULL, effect_n = NULL,
+                                           overlap_variables = FALSE,
+                                           M = 500, min_abs_corr = 0.5) {
+  
   # - check individual level data
   if (!is.null(X) & !is.null(Y)) {
     # --- check input
@@ -334,7 +509,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
   } else {
     yx_dict <- keep_variable_individual <- NULL
   }
-
+  
   # - check summary-level data
   if ((!is.null(sumstat)) | (!is.null(effect_est) & !is.null(effect_se))) {
     # --- check input of (effect_est, effect_se)
@@ -376,7 +551,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         sumstat[[sum_iy]] <- sumstat_y
       }
     }
-
+    
     if (!is.null(sumstat)) {
       if (is.data.frame(sumstat)) {
         sumstat <- list(sumstat)
@@ -385,6 +560,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         warning("Error: Input sumstat must be the list containing summary level data for all outcomes!")
         return(NULL)
       }
+      sumstat <- lapply(sumstat, as.data.frame)
       # --- check if variables names in summary data
       variable.tmp <- sapply(sumstat, function(xx) {
         if (("variant" %in% colnames(xx))) {
@@ -407,21 +583,32 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
       }
     }
     
+    # Remove NA for sumstat$variant columns - add on
+    sumstat <- lapply(seq_along(sumstat), function(i) {
+      xx <- sumstat[[i]]
+      if (anyNA(xx$variant)) {
+        warning(paste("Removed variant with NA from sumstat", i))
+        xx = as.data.frame(xx[!is.na(xx$variant), , drop = FALSE])
+      }
+      return(xx)
+    })
     # Remove duplicates and report: if duplicate variant in summary statistics
     sumstat <- lapply(seq_along(sumstat), function(i) {
       xx <- sumstat[[i]]
       if (anyDuplicated(xx$variant)) {
-        message(paste("Removed duplicate variants from sumstat", i))
-        xx[!duplicated(xx$variant), , drop = FALSE]
-      } else {
-        xx
+        warning(paste("Removed duplicate variants from sumstat", i))
+        xx = as.data.frame(xx[!duplicated(xx$variant), , drop = FALSE])
       }
+      return(xx)
     })
-    keep_variable_sumstat <- lapply(sumstat, function(xx) {
-      xx$variant
-    })
-
+    
     # --- check input of LD
+    M_updated <- M
+    min_abs_corr_updated <- min_abs_corr
+    jk_equiv_corr_updated <- 0.8
+    jk_equiv_loglik_updated <- 1
+    func_simplex_updated <- "LD_z2z"
+    
     if (is.null(LD)) {
       # if no LD input, set diagonal matrix to LD
       warning(
@@ -433,55 +620,84 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
       LD <- 1
       sumstatLD_dict <- rep(1, length(sumstat))
       # change some algorithm parameters
-      M <- 1 # one iteration
-      min_abs_corr <- 0 # remove purity checking
-      jk_equiv_corr <- 0
-      jk_equiv_loglik <- 0.1
-      func_simplex <- "only_z2z"
+      M_updated <- 1 # one iteration
+      min_abs_corr_updated <- 0 # remove purity checking
+      jk_equiv_corr_updated <- 0
+      jk_equiv_loglik_updated <- 0.1
+      func_simplex_updated <- "only_z2z"
+      
     } else {
-      if (is.data.frame(LD)) {
-        LD <- as.matrix(LD)
-      }
-      if (is.matrix(LD)) {
-        LD <- list(LD)
-      }
+      
+      if (is.data.frame(LD)) LD <- as.matrix(LD)
+      if (is.matrix(LD)) LD <- list(LD)
       # - check if NA in LD matrix
       num_na <- sapply(LD, sum)
       if (any(is.na(num_na))){
         warning("Error: Input LD must not contain missing values (NA).")
         return(NULL)
       }
+      # Create sumstat-LD mapping ===
       if (length(LD) == 1) {
         sumstatLD_dict <- rep(1, length(sumstat))
       } else if (length(LD) == length(sumstat)) {
-        sumstatLD_dict <- 1:length(sumstat)
+        sumstatLD_dict <- seq_along(sumstat)
       } else {
         if (is.null(dict_sumstatLD)) {
-          warning("Error: Please provide the dict_sumstatLD since you have multiple sumstat but only few LD!")
+          warning('Error: Please provide dict_sumstatLD: you have ', length(sumstat), 
+                  ' sumstats but only ', length(LD), ' LD matrices')
           return(NULL)
-        } else {
-          # - dict for sumstat to LD mapping
-          sumstatLD_dict <- rep(NA, length(sumstat))
-          for (i in 1:length(sumstat)) {
-            tmp <- unique(dict_sumstatLD[dict_sumstatLD[, 1] == i, 2])
-            if (length(tmp) == 0) {
-              warning(paste("Error: You don't provide matched LD for sumstat", i))
-              return(NULL)
-            } else if (length(tmp) != 1) {
-              warning(paste("Error: You provide multiple matched LD for sumstat", i))
-              return(NULL)
-            } else {
-              sumstatLD_dict[i] <- tmp
-            }
+        }
+        if (length(dict_sumstatLD) != length(sumstat)) {
+          warning('Error: dict_sumstatLD must have length ', length(sumstat))
+          return(NULL)
+        }
+        if (any(is.na(dict_sumstatLD))) {
+          warning('Error: dict_sumstatLD contains NA values')
+          return(NULL)
+        }
+        if (any(dict_sumstatLD < 1) || any(dict_sumstatLD > length(LD))) {
+          warning('Error: dict_sumstatLD values must be between 1 and ', length(LD))
+          return(NULL)
+        }
+        sumstatLD_dict <- as.integer(dict_sumstatLD)
+      }
+      
+      # === Filter variants for each sumstat ===
+      for (i in seq_along(sumstat)) {
+        # Get sumstat variants (adjust field name based on your data structure)
+        sumstat_variants <- sumstat[[i]]$variant
+        n_total <- length(sumstat_variants)
+        # Get LD variants
+        ld_idx <- sumstatLD_dict[i]
+        current_ld <- LD[[ld_idx]]
+        ld_variants <- rownames(current_ld)
+        if (is.null(ld_variants)) {
+          if (ncol(current_ld) != n_total){
+            warning('Error: LD matrix ', ld_idx, ' has no rownames. Please ensure all LD matrices have variant names as rownames.')
+            return(NULL)
           }
-          if (max(sumstatLD_dict) > length(LD)) {
-            warning("Error: You don't provide enough LD matrices!")
+        }
+        # Find common variants
+        common_variants <- intersect(sumstat_variants, ld_variants)
+        n_removed <- n_total - length(common_variants)
+        # Filter if needed
+        if (n_removed > 0) {
+          warning('Sumstat ', i, ': removing ', n_removed, ' out of ', n_total,
+                  ' variants since those variants are not in LD matrix ', ld_idx)
+          keep_idx <- match(common_variants, sumstat_variants)
+          if (length(keep_idx) == 0){
+            warning('Error: Sumstat data ', i, ' is empty after filtering. Returning NULL')
             return(NULL)
           }
+          # Filter all relevant fields - ADJUST THESE FIELD NAMES TO YOUR DATA
+          sumstat[[i]] <- sumstat[[i]][keep_idx, , drop = FALSE]
         }
       }
     }
-
+    keep_variable_sumstat <- lapply(sumstat, function(xx) {
+      xx$variant
+    })
+    
     # - checking sample size existency
     n_exist <- sapply(sumstat, function(ss) {
       "n" %in% colnames(ss)
@@ -498,7 +714,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         "Outcome ", paste(p_no, collapse = ", "), " in sumstat don't contain 'n'!"
       )
     }
-
+    
     Z <- N_sumstat <- Var_y <- SeBhat <- vector(mode = "list", length = length(sumstat))
     for (i.summstat in 1:length(sumstat)) {
       summstat_tmp <- sumstat[[i.summstat]]
@@ -528,7 +744,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         warning(paste("summary statistic dataset", i.summstat, "contains NA values that are replaced with 0"))
         z[is.na(z)] <- 0
       }
-
+      
       # - check N
       if (!("n" %in% colVar)) {
         z <- z
@@ -541,7 +757,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         }
         n <- median(n)
         N_sumstat[[i.summstat]] <- n
-
+        
         # When n is provided, compute the adjusted z-scores.
         z <- z * sqrt((n - 1) / (z^2 + n - 2))
         if ("var_y" %in% colVar) {
@@ -561,99 +777,31 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
     }
   } else {
     Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- NULL
+    M_updated <- M
+    min_abs_corr_updated <- min_abs_corr
+    jk_equiv_corr_updated <- 0.8
+    jk_equiv_loglik_updated <- 1
+    func_simplex_updated <- "LD_z2z"
   }
-  # - initial colocboost object
-  keep_variables <- c(keep_variable_individual, keep_variable_sumstat)
-  overlapped_variables <- Reduce("intersect", keep_variables)
-  mean_variables <- mean(sapply(keep_variables, length))
-  min_variables <- min(sapply(keep_variables, length))
-  if (min_variables < 100) {
-    warning(
-      "Warning message about the number of variables.\n",
-      "The smallest number of variables across outcomes is ", min_variables, " < 100.",
-      " If this is what you expected, this is not a problem.",
-      " If this is not what you expected, please check input data."
-    )
-  }
-  if (length(overlapped_variables) <= 1) {
-    warning(
-      "Error: No or only 1 overlapping variables were found across all outcomes, colocalization cannot be performed. ",
-      "Please verify the variable names across different outcomes."
-    )
-    return(NULL)
-  } else if ((length(overlapped_variables) / mean_variables) < 0.1) {
-    warning(
-      "Warning message about the overlapped variables.\n",
-      "The average number of variables across outcomes is ", mean_variables,
-      ". But only ", length(overlapped_variables), " number of variables overlapped (<10%).\n",
-      " If this is what you expected, this is not a problem.",
-      " If this is not what you expected, please check if the variable name matched across outcomes."
-    )
-  }
-  cb_data <- colocboost_init_data(
-    X = X, Y = Y, dict_YX = yx_dict,
-    Z = Z, LD = LD, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict,
-    Var_y = Var_y, SeBhat = SeBhat,
-    keep_variables = keep_variables,
-    focal_outcome_idx = focal_outcome_idx,
-    focal_outcome_variables = focal_outcome_variables,
-    overlap_variables = overlap_variables,
-    intercept = intercept,
-    standardize = standardize,
-    residual_correlation = residual_correlation
-  )
-
-  ##################  colocboost updates   ###################################
-  message("Starting gradient boosting algorithm.")
-  cb_obj <- colocboost_workhorse(cb_data,
-    M = M,
-    prioritize_jkstar = prioritize_jkstar,
-    tau = tau,
-    learning_rate_init = learning_rate_init,
-    learning_rate_decay = learning_rate_decay,
-    func_simplex = func_simplex,
-    lambda = lambda,
-    lambda_focal_outcome = lambda_focal_outcome,
-    stop_thresh = stop_thresh,
-    func_multi_test = func_multi_test,
-    stop_null = stop_null,
-    multi_test_max = multi_test_max,
-    multi_test_thresh = multi_test_thresh,
-    ash_prior = ash_prior,
-    p.adjust.methods = p.adjust.methods,
-    jk_equiv_corr = jk_equiv_corr,
-    jk_equiv_loglik = jk_equiv_loglik,
-    func_compare = func_compare,
-    coloc_thresh = coloc_thresh,
-    LD_free = LD_free,
-    dynamic_learning_rate = dynamic_learning_rate,
-    focal_outcome_idx = focal_outcome_idx,
-    outcome_names = outcome_names
-  )
-
-  # --- post-processing of the colocboost updates
-  message("Performing inference on colocalization events.")
-  cb_output <- colocboost_assemble(cb_obj,
-    coverage = coverage,
-    weight_fudge_factor = weight_fudge_factor,
-    check_null = check_null,
-    check_null_method = check_null_method,
-    check_null_max = check_null_max,
-    check_null_max_ucos = check_null_max_ucos,
-    dedup = dedup,
-    overlap = overlap,
-    n_purity = n_purity,
-    min_abs_corr = min_abs_corr,
-    sec_coverage_thresh = sec_coverage_thresh,
-    median_abs_corr = median_abs_corr,
-    min_cluster_corr = min_cluster_corr,
-    median_cos_abs_corr = median_cos_abs_corr,
-    weaker_effect = weaker_effect,
-    merge_cos = merge_cos,
-    tol = tol,
-    output_level = output_level
-  )
-  class(cb_output) <- "colocboost"
-  return(cb_output)
+  
+  return(list(
+    X = X,
+    Y = Y,
+    yx_dict = yx_dict,
+    keep_variable_individual = keep_variable_individual,
+    sumstat = sumstat,
+    LD = LD,
+    sumstatLD_dict = sumstatLD_dict,
+    keep_variable_sumstat = keep_variable_sumstat,
+    Z = Z,
+    N_sumstat = N_sumstat,
+    Var_y = Var_y,
+    SeBhat = SeBhat,
+    M = M_updated,
+    min_abs_corr = min_abs_corr_updated,
+    jk_equiv_corr = jk_equiv_corr_updated,
+    jk_equiv_loglik = jk_equiv_loglik_updated,
+    func_simplex = func_simplex_updated
+  ))
 }
 
diff --git a/man/colocboost_validate_input_data.Rd b/man/colocboost_validate_input_data.Rd
new file mode 100644
index 0000000..b8616e9
--- /dev/null
+++ b/man/colocboost_validate_input_data.Rd
@@ -0,0 +1,70 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/colocboost.R
+\name{colocboost_validate_input_data}
+\alias{colocboost_validate_input_data}
+\title{Validate and Process All Input Data for ColocBoost}
+\usage{
+colocboost_validate_input_data(
+  X = NULL,
+  Y = NULL,
+  sumstat = NULL,
+  LD = NULL,
+  dict_YX = NULL,
+  dict_sumstatLD = NULL,
+  effect_est = NULL,
+  effect_se = NULL,
+  effect_n = NULL,
+  overlap_variables = FALSE,
+  M = 500,
+  min_abs_corr = 0.5
+)
+}
+\arguments{
+\item{X}{A list of genotype matrices for different outcomes, or a single matrix if all outcomes share the same genotypes.}
+
+\item{Y}{A list of vectors of outcomes or an N by L matrix if it is considered for the same X and multiple outcomes.}
+
+\item{sumstat}{A list of data.frames of summary statistics.}
+
+\item{LD}{A list of correlation matrices indicating the LD matrix for each genotype.}
+
+\item{dict_YX}{A L by 2 matrix of dictionary for X and Y if there exist subsets of outcomes corresponding to the same X matrix.}
+
+\item{dict_sumstatLD}{A L by 2 matrix of dictionary for sumstat and LD if there exist subsets of outcomes corresponding to the same sumstat.}
+
+\item{effect_est}{Matrix of variable regression coefficients (i.e. regression beta values) in the genomic region}
+
+\item{effect_se}{Matrix of standard errors associated with the beta values}
+
+\item{effect_n}{A scalar or a vector of sample sizes for estimating regression coefficients.}
+
+\item{overlap_variables}{If overlap_variables = TRUE, only perform colocalization in the overlapped region.}
+
+\item{M}{The maximum number of gradient boosting rounds for each outcome (default is 500).}
+
+\item{min_abs_corr}{Minimum absolute correlation allowed in a confidence set.}
+}
+\value{
+A list containing:
+\item{X}{Processed list of genotype matrices}
+\item{Y}{Processed list of phenotype vectors}
+\item{yx_dict}{Dictionary mapping Y to X}
+\item{keep_variable_individual}{List of variable names for each X matrix}
+\item{sumstat}{Processed list of summary statistics data.frames}
+\item{LD}{Processed list of LD matrices}
+\item{sumstatLD_dict}{Dictionary mapping sumstat to LD}
+\item{keep_variable_sumstat}{List of variant names for each sumstat}
+\item{Z}{List of z-scores for each outcome}
+\item{N_sumstat}{List of sample sizes for each outcome}
+\item{Var_y}{List of phenotype variances for each outcome}
+\item{SeBhat}{List of standard errors for each outcome}
+\item{M_updated}{Updated M value (may be changed if LD not provided)}
+\item{min_abs_corr_updated}{Updated min_abs_corr value (may be changed if LD not provided)}
+\item{jk_equiv_corr_updated}{Updated jk_equiv_corr value}
+\item{jk_equiv_loglik_updated}{Updated jk_equiv_loglik value}
+\item{func_simplex_updated}{Updated func_simplex value}
+}
+\description{
+Internal function to validate and process both individual-level and summary-level input data
+}
+\keyword{internal}
diff --git a/tests/testthat/test_sumstats.R b/tests/testthat/test_sumstats.R
index deb3aaa..ef896c3 100644
--- a/tests/testthat/test_sumstats.R
+++ b/tests/testthat/test_sumstats.R
@@ -247,7 +247,7 @@ test_that("colocboost handles mismatched inputs correctly", {
   
   # Expect error with mismatched dimensions
   expect_warning(
-    colocboost(
+    colocboost_validate_input_data(
       effect_est = bad_effect_est,
       effect_se = test_sumstat_data$effect_se
     ),
@@ -281,4 +281,77 @@ test_that("colocboost handles missing sample size correctly", {
   
   # Still should get a valid result
   expect_s3_class(result, "colocboost")
-})
\ No newline at end of file
+})
+
+# Test 8: Handling of NA variants
+test_that("colocboost removes NA variants correctly", {
+  sumstat_with_na <- test_sumstat_data$sumstat
+  sumstat_with_na[[1]]$variant[1:2] <- NA
+  
+  warnings <- capture_warnings({
+    validated_data <- colocboost_validate_input_data(
+      sumstat = sumstat_with_na,
+      LD = test_sumstat_data$LD
+    )
+  })
+  
+  expect_true(any(grepl("Removed variant with NA from sumstat 1", warnings)))
+  # Should have 2 fewer variants
+  expect_true(length(validated_data$Z[[1]]) == length(validated_data$Z[[2]])-2)
+})
+
+# Test 9: Handling of duplicate variants
+test_that("colocboost removes duplicate variants correctly", {
+  sumstat_with_dup <- test_sumstat_data$sumstat
+  # Duplicate the first variant
+  sumstat_with_dup[[1]] <- rbind(
+    sumstat_with_dup[[1]][1, ],
+    sumstat_with_dup[[1]]
+  )
+  
+  warnings <- capture_warnings({
+    result <- colocboost(
+      sumstat = sumstat_with_dup,
+      LD = test_sumstat_data$LD
+    )
+  })
+  
+  expect_true(any(grepl("Removed duplicate variants from sumstat 1", warnings)))
+  expect_s3_class(result, "colocboost")
+  # Should be back to original count
+  expect_equal(length(result$data_info$variables), ncol(test_sumstat_data$X))
+})
+
+# Test 10: Handling of mismatched variants between sumstat and LD
+test_that("colocboost handles variant mismatch between sumstat and LD", {
+  # Create LD with different variant names
+  LD_modified <- test_sumstat_data$LD
+  rownames(LD_modified) <- colnames(LD_modified) <- paste0("SNP", 11:30)
+  
+  warnings <- capture_warnings({
+    result <- colocboost_validate_input_data(
+      sumstat = test_sumstat_data$sumstat,
+      LD = LD_modified
+    )
+  })
+  
+  # Should warn about removing variants
+  expect_true(any(grepl("removing.*variants since those variants are not in LD", warnings)))
+})
+
+# Test 11: Error when no common variants
+test_that("colocboost errors with no common variants", {
+  # Create LD with completely different variant names
+  LD_no_overlap <- test_sumstat_data$LD
+  rownames(LD_no_overlap) <- colnames(LD_no_overlap) <- paste0("DIFF", 1:20)
+  
+  warnings <- capture_warnings(
+    colocboost_validate_input_data(
+      sumstat = test_sumstat_data$sumstat,
+      LD = LD_no_overlap
+    )
+  )
+  expect_true(any(grepl("removing.*variants since those variants are not in LD", warnings)))
+  expect_true(any(grepl("is empty after filtering", warnings)))
+})
+
diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
index 49bc4c1..7577a27 100644
--- a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
+++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
@@ -22,7 +22,7 @@ This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost
 - See more details about input data preparation in `xqtl_protocol` with [link](https://statfungen.github.io/xqtl-protocol/code/mnm_analysis/mnm_methods/colocboost.html).
 
 
-# 1. Loading Data using `colocboost_analysis_pipeline` function
+# 1. Loading individual-level and summary statistics using `load_multitask_regional_data` function from multiple cohorts or datasets
 
 This function harmonizes the input data and prepares it for colocalization analysis. 
 
@@ -34,258 +34,9 @@ The output is a list with `individual_data` and `sumstat_data` components, where
 This list is then passed to the `colocboost_analysis_pipeline` function for the colocalization analysis.
 
 
-Below are the input parameters for this function for loading individual-level data:
-
-## 1.1. Loading individual-level data from multiple cohorts
-
-inputs:
-- **`region`**: String ; Genomic region of interest in the format of `chr:start-end` for the phenotype region you want to analyze. 
-- **`genotype_list`**: Character vector; Paths for PLINK bed files containing genotype data (do NOT include .bed suffix). 
-- **`phenotype_list`**: Character vector; Paths for phenotype file names.
-- **`covariate_list`**: Character vector; Paths for covariate file names for each phenotype. Must have the same length as the phenotype file vector.
-- **`conditions_list_individual`**: Character vector; Strings representing different conditions or groups used for naming. Must have the same length as the phenotype file vector.
-- **`match_geno_pheno`**: Integer vector; Indices of phenotypes matched to genotype if multiple genotype PLINK files are used. For each phenotype file in `phenotype_list`, the index of the genotype file in `genotype_list` it matches with.
-- **`association_window`**: String; Genomic region of interest in the format of `chr:start-end` for the association analysis window of variants to test (cis or trans). If not provided, all genotype data will be loaded.
-- **`extract_region_name`**: List of character vectors; Phenotype names (e.g., gene ID `ENSG00000269699`) to subset the phenotype data when there are multiple phenotypes availible in the region. Must have the same length as the phenotype file vector. Default is `NULL`, which will use all phenotypes in the region.
-- **`region_name_col`**: Integer; 1-based index of the column containing the region name (i.e. 4 for gene ID in a bed file). Required if `extract_region_name` is not `NULL`, or if multiple phenotypes fall into the same region in one phenotype file
-- **`keep_indel`**: Logical; indicating whether to keep insertions/deletions (INDELs). Default is `TRUE`.
-- **`keep_samples`**: Character vector; Sample names to keep. Default is `NULL`. Currently only supports keeping the same samples from all genotype and phenotype files.
-- **`maf_cutoff`**: Numeric; Minimum minor allele frequency (MAF) cutoff. Default is 0.
-- **`mac_cutoff`**: Numeric; Minimum minor allele count (MAC) cutoff. Default is 0.
-- **`xvar_cutoff`**: Numeric; Minimum genotype variance cutoff. Default is 0.
-- **`imiss_cutoff`**: Numeric; Maximum individual missingness cutoff. Default is 0.
-
-outputs: 
-- **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function. If only individual-level data is loaded, `sumstat_data` will be `NULL`.
-
-
-**Indivudual-level data loading example**
-
-The following example demonstrates how to set up input data with 3 phenotypes and 2 cohorts. The first cohort has 2 phenotypes and the second cohort has 1 phenotype. The first phenotype has 2 genes and the second phenotype has 1 gene.
-
-```{r, data-loader-individual}
-# Example of loading individual-level data
-region = "chr1:1000000-2000000"
-genotype_list = c("plink_cohort1.1", "plink_cohort1.2")
-phenotype_list = c("phenotype1_cohort1.bed.gz", "phenotype2_cohort1.bed.gz", "phenotype1_cohort2.bed.gz")
-covariate_list = c("covariate1_cohort1.bed.gz", "covariate2_cohort1.bed.gz", "covariate1_cohort2.bed.gz")
-conditions_list_individual = c("phenotype1_cohort1", "phenotype2_cohort1", "phenotype1_cohort2")
-match_geno_pheno = c(1,1,2)
-association_window = "chr1:1000000-2000000" # set to be the same as region for cis-analysis
-extract_region_name = list(c("ENSG00000269699, ENSG00000789633"), c("ENSG00000269699"), c("ENSG00000269699", "ENSG00000789633"))
-region_name_col = 4
-keep_indel = TRUE
-keep_samples = c("SAMPLE1", "SAMPLE2", "SAMPLE3")
-
-# Following parameters need to be set according to your data
-maf_cutoff = 0.01
-mac_cutoff = 10
-xvar_cutoff = 0
-imiss_cutoff = 0.9
-
-# More advanced parameters see pecotmr::load_multitask_regional_data()
-
-#### Comment out to avoid running this code here, as we do not have real data files in this example ####
-# region_data_individual <- load_multitask_regional_data(
-#     region = region,
-#     genotype_list = genotype_list,
-#     phenotype_list = phenotype_list,
-#     covariate_list = covariate_list,
-#     conditions_list_individual = conditions_list_individual,
-#     match_geno_pheno = match_geno_pheno,
-#     association_window = association_window,
-#     region_name_col = region_name_col,
-#     extract_region_name = extract_region_name,
-#     keep_indel = keep_indel,
-#     keep_samples = keep_samples,
-#     maf_cutoff = maf_cutoff,
-#     mac_cutoff = mac_cutoff,
-#     xvar_cutoff = xvar_cutoff,
-#     imiss_cutoff = imiss_cutoff
-# )
-#### End of comment out
-
-```
-
-
-
-## 1.2. Loading summary statistics from multiple cohorts or datasets
-
-inputs:
-- **`sumstat_path_list`**: Character vector; Paths to the summary statistics.
-- **`column_file_path_list`**: Character vector; Paths to the column mapping files. See below for expected format.
-- **`LD_meta_file_path_list`**: Character vector; Paths to LD metadata files. See below for expected format.
-- **`conditions_list_sumstat`**: Character vector; Strings representing different sumstats used for naming. Must have the same length as the sumstat file vector.
-- **`match_LD_sumstat`**: List of character vectors; Mapping each LD metadata file to the summary-statistics conditions to pair with it. Length must equal `LD_meta_file_path_list`. Each element is a character vector of names present in `conditions_list_sumstat`. If omitted or an element is empty, defaults to all conditions for the first LD.
-- **`association_window`**: String; Genomic region of interest in the format of `chr:start-end` for the association analysis window of variants to test (cis or trans). If not provided, all genotype data will be loaded.
-- **`n_samples`**: Integer vector; Sample size. Set a 0 if `n_cases`/`n_controls` are passed explicitly. If unknown, set as 0 and include `n_samples` column in the column mapping file to retrieve from the sumstat file.
-- **`n_cases`**: Integer vector; Number of cases. Set a 0 if `n_samples` is passed explicitly. If unknown, set as 0 and include `n_cases` column in the column mapping file to retrieve from the sumstat file.
-- **`n_controls`**: Integer vector; Number of controls. Set a 0 if `n_samples` is passed explicitly. If unknown, set as 0 and include `n_controls` column in the column mapping file to retrieve from the sumstat file.
-
-outputs: 
-- **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function. If only summary statistics data is loaded, `individual_data` will be `NULL`.
-
-**Summary statistics loading example**
-
-The following example demonstrates how to set up input data with 2 summary statistics and one LD reference.
-
-```{r, data-loader-sumstat}
-# Example of loading summary statistics
-sumstat_path_list = c("sumstat1.tsv.gz", "sumstat2.tsv.gz")
-column_file_path_list = c("column_mapping_sumstat1.yml", "column_mapping_sumstat2.yml")
-LD_meta_file_path_list = c("ld_meta_file.tsv")
-conditions_list_sumstat = c("sumstat_1", "sumstat_2")
-match_LD_sumstat = c("sumstat_1", "sumstat_2")
-association_window = "chr1:1000000-2000000"
-
-# Following parameters need to be set according to your data
-n_samples = c(300000, 0)
-n_cases = c(0, 20000)
-n_controls = c(0, 40000)
-
-
-# More advanced parameters see pecotmr::load_multitask_regional_data()
-
-#### Comment out to avoid running this code here, as we do not have real data files in this example ####
-# region_data_sumstat <- load_multitask_regional_data(
-#     sumstat_path_list = sumstat_path_list,
-#     column_file_path_list = column_file_path_list,
-#     LD_meta_file_path_list = LD_meta_file_path_list,
-#     conditions_list_sumstat = conditions_list_sumstat,
-#     match_LD_sumstat = match_LD_sumstat,
-#     association_window = association_window,
-#     n_samples = n_samples,
-#     n_cases = n_cases,
-#     n_controls = n_controls
-# )
-#### End of comment out
-```
-
-
-
-**Expected format for column mapping file**
-The column mapping file is YAML (`.yml`) with key: value pairs mapping your input column names to the standardized names expected by the loader. 
-Required columns are `chrom`, `pos`, `A1`, and `A2`, and either `z` or `beta` and `sebeta`. 
-Either 'n_case' and 'n_control' or 'n_samples' can be passed as part of the column mapping, but will be overwritten by the n_cases and n_controls or n_samples parameterspassed explicitly.
-```yaml
-# required
-chrom: chromosome
-pos: position
-A1: effect_allele
-A2: non_effect_allele
-
-z: zscore 
-# or
-beta: beta
-sebeta: sebeta
-
-# optional, will be overridden by n_samples or n_cases and n_controls if passed explicitly
-n_case: NCASE
-n_control: NCONTROL
-# or
-n_sample: N
-```
-
-
-**Expected format for LD metadata file**
-LD files sould be in the format generated by for instance `plink --r squared`, then xz compressed.
-The LD metadata file is a tab-separated file with the following columns:
-- `chrom`: chromosome
-- `start`: start position
-- `end`: end position
-- `ld_path, bim_path`: path to the LD block file and bim file
-```
-1	1000000	2000000	ld_block_1.ld.gz,ld_block_1.bim
-```
-
 
 # 2. Perform ColocBoost using `colocboost_analysis_pipeline` function
 
 In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_analysis_pipeline` function. 
 The colocalization analysis can be run in any one of three modes, or in a combination of these modes (names assume that individual-level data is xQTL data and summary statistics data is GWAS data):
 
-- **`xQTL-only mode`**: Only perform colocalization analysis on the individual-level data. Summary statistics data is not used.
-- **`joint GWAS mode`**: Perform colocalization analysis in disease-agnostic mode on the individual-level and summary statistics data together.
-- **`separate GWAS mode`**: Perform colocalization analysis in disease-prioritized mode on the the individual-level data and each summary statistics dataset separately, treating each summary statistics dataset as the focal trait.
-
-inputs:
-- **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function.
-- **`focal_trait`**: String; For xQTL-only mode, the name of the trait to perform disease-prioritized ColocBoost, from `conditions_list_individual`. If not provided, xQTL-only mode will be run without disease-prioritized mode.
-- **`event_filters`**: List of character vectors; Patterns for filtering events based on context names. 
-Example: for sQTL, `list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = "clu_(\\d+_[+-?]):PR:", exclude_pattern = "clu_(\\d+_[+-?]):IN:")`.
-- **`maf_cutoff`**: Numeric; Minor allele frequency cutoff. Default is 0.005.
-- **`pip_cutoff_to_skip_ind`**: Integer vector; Cutoff values for skipping analysis based on pre-screening with single-effect SuSiE (L=1). Context is skipped if none of the variants in the context have PIP values greater than the cutoff. Default is 0 (does not run single-effect SuSiE). Passing a negative value sets the cutoff to 3/number of variants.
-- **`pip_cutoff_to_skip_sumstat`**: Integer vector; Cutoff values for skipping analysis based on pre-screening with single-effect SuSiE (L=1). Sumstat is skipped if none of the variants in the sumstat have PIP values greater than the cutoff. Default is 0 (does not run single-effect SuSiE). Passing a negative value sets the cutoff to 3/number of variants.
-- **`qc_method`**: String; Quality control method to use. Options are "rss_qc", "dentist", or "slalom". Default is `rss_qc`.
-- **`impute`**: Logical; if TRUE, performs imputation for outliers identified in the analysis. Default is `TRUE`.
-- **`impute_opts`**: List of lists; Imputation options including rcond, R2_threshold, and minimum_ld. Default is `list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5)`.
-- **`xqtl_coloc`**: Logical; if TRUE, performs xQTL-only mode. Default is `TRUE`.
-- **`joint_gwas`**: Logical; if TRUE, performs joint GWAS mode, mapping all individual-level and sumstat data together.Default is `FALSE`.
-- **`separate_gwas`**: Logical; if TRUE, runs separate GWAS mode, where each sumstat dataset is analyzed separately with all individual-level data, treating each sumstat as the focal trait in disease-prioritized mode. Default is `FALSE`.
-
-outputs:
-- **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_analysis_pipeline` function. If the mode is not run, the corresponding element will be `NULL`.
-
-```{r, colocboost-analysis}
-
-#### Comment out to avoid running this code here, as we do not have real data files in this example ####
-#### Please check the example code below ####
-
-# # load in individual-level and sumstat data
-# region_data_combined <- load_multitask_regional_data(
-#     region = region,
-#     genotype_list = genotype_list,
-#     phenotype_list = phenotype_list,
-#     covariate_list = covariate_list,
-#     conditions_list_individual = conditions_list_individual,
-#     match_geno_pheno = match_geno_pheno,
-#     association_window = association_window,
-#     region_name_col = region_name_col,
-#     extract_region_name = extract_region_name,
-#     keep_indel = keep_indel,
-#     keep_samples = keep_samples,
-#     maf_cutoff = maf_cutoff,
-#     mac_cutoff = mac_cutoff,
-#     xvar_cutoff = xvar_cutoff,
-#     imiss_cutoff = imiss_cutoff,
-#     sumstat_path_list = sumstat_path_list,
-#     column_file_path_list = column_file_path_list,
-#     LD_meta_file_path_list = LD_meta_file_path_list,
-#     conditions_list_sumstat = conditions_list_sumstat,
-#     match_LD_sumstat = match_LD_sumstat,
-#     n_samples = n_samples,
-#     n_cases = n_cases,
-#     n_controls = n_controls
-# )
-
-# maf_cutoff = 0.01
-# pip_cutoff_to_skip_ind = rep(0, length(phenotype_list))
-# pip_cutoff_to_skip_sumstat = rep(0, length(sumstat_path_list))
-# qc_method = "rss_qc" 
-
-# # run colocboost analysis
-# colocboost_results <- colocboost_analysis_pipeline(
-#     region_data_combined,
-#     maf_cutoff = maf_cutoff, 
-#     pip_cutoff_to_skip_ind = pip_cutoff_to_skip_ind,
-#     pip_cutoff_to_skip_sumstat = pip_cutoff_to_skip_sumstat,
-#     qc_method = qc_method,
-#     xqtl_coloc = TRUE,
-#     joint_gwas = TRUE,
-#     separate_gwas = TRUE
-# )
-
-# # visualize results for xQTL-only mode
-# colocboost_plot(colocboost_results$xqtl_coloc)
-
-# # visualize results for joint GWAS mode
-# colocboost_plot(colocboost_results$joint_gwas)
-
-# # visualize results for separate GWAS mode
-# for (i in 1:length(colocboost_results$separate_gwas)) {
-#     colocboost_plot(colocboost_results$separate_gwas[[i]])
-# }
-
-
-#### End of comment out
-```
\ No newline at end of file

From 13095366a156c19d870a3c2e24093aa7a6591d5e Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Thu, 6 Nov 2025 14:40:27 -0500
Subject: [PATCH 12/12] Update ColocBoost_Wrapper_Pipeline.Rmd

---
 vignettes/ColocBoost_Wrapper_Pipeline.Rmd | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
index 7577a27..800d2f7 100644
--- a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
+++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
@@ -22,21 +22,8 @@ This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost
 - See more details about input data preparation in `xqtl_protocol` with [link](https://statfungen.github.io/xqtl-protocol/code/mnm_analysis/mnm_methods/colocboost.html).
 
 
-# 1. Loading individual-level and summary statistics using `load_multitask_regional_data` function from multiple cohorts or datasets
+Step 1: Loading individual-level and summary statistics using `load_multitask_regional_data` function from multiple cohorts or datasets
 
-This function harmonizes the input data and prepares it for colocalization analysis. 
 
-
-In this section, we introduce how to load the regional data required for the ColocBoost analysis using the `load_multitask_regional_data` function. 
-This function loads mixed datasets for a specific region, including individual-level data (genotype, phenotype, covariate data), summary statistics 
-(sumstats, LD), or a combination of both. It runs `load_regional_univariate_data` for each individual-level dataset and `load_rss_data` for each summary statistics dataset.
-The output is a list with `individual_data` and `sumstat_data` components, where `individual_data` is a list of individual-level data and `sumstat_data` is a list of summary statistics data. 
-This list is then passed to the `colocboost_analysis_pipeline` function for the colocalization analysis.
-
-
-
-# 2. Perform ColocBoost using `colocboost_analysis_pipeline` function
-
-In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_analysis_pipeline` function. 
-The colocalization analysis can be run in any one of three modes, or in a combination of these modes (names assume that individual-level data is xQTL data and summary statistics data is GWAS data):
+Step 2: Perform ColocBoost using `colocboost_analysis_pipeline` function