From 5db331fab4280d46818e441cacff3dde21997045 Mon Sep 17 00:00:00 2001 From: Alex Petty Date: Mon, 26 May 2025 16:34:45 -0400 Subject: [PATCH 1/2] Update Tractor chunk processing to not repeatedly re-read from the beginning of the file. --- scripts/run_tractor.R | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/run_tractor.R b/scripts/run_tractor.R index 9bca688..6030482 100755 --- a/scripts/run_tractor.R +++ b/scripts/run_tractor.R @@ -231,7 +231,6 @@ RunTractor <- function(prefix, phenofile, sampleidcol, phenocol, covarcollist, c COV_ = NULL iters <- 1 - skip_val <- 1 max_iters <- ceiling(totallines/chunksize) data_colnames <- NULL @@ -251,16 +250,30 @@ RunTractor <- function(prefix, phenofile, sampleidcol, phenocol, covarcollist, c dopar_packages <- c("data.table","dplyr") dopar_functions <- c("subset_mat_NA","extract_model_info") + inPipes = lapply(inFiles, function(file) { + if (!endsWith(file,".gz")) { + con <- pipe(sprintf("cat %s", shQuote(file)), open="r") + on.exit(try(close(con), silent=TRUE)) + return(con) + } else { + con <- pipe(sprintf("gzip -cd %s", shQuote(file)), open="r") + on.exit(try(close(con), silent=TRUE)) + return(con) + } + }) + while (iters <= max_iters) { if (iters != 1) { - data = lapply(inFiles,function(file) { - data.table::fread(file, nrows=chunksize, skip=skip_val, + data = lapply(inPipes,function(file) { + lins <- readLines(file, n=chunksize) + data.table::fread(text=lins, nrows=chunksize, col.names=data_colnames, sep="\t") #, header=TRUE }) } else { - data = lapply(inFiles,function(file) { - data.table::fread(file, nrows=chunksize, skip=skip_val-1, + data = lapply(inPipes,function(file) { + lins <- readLines(file, n=chunksize) + data.table::fread(text=lins, nrows=chunksize, sep="\t", header=TRUE) }) data_colnames <- colnames(data[[1]]) @@ -416,7 +429,6 @@ RunTractor <- function(prefix, phenofile, sampleidcol, phenocol, covarcollist, c # Updating looping variables iters = iters + 1 - skip_val = skip_val + chunksize } } From a7db4de6b37568ae90baf24ac1d0542741d57663 Mon Sep 17 00:00:00 2001 From: Alex Petty Date: Tue, 27 May 2025 15:34:59 -0400 Subject: [PATCH 2/2] Fix pipes being closed prematurely. --- scripts/run_tractor.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/run_tractor.R b/scripts/run_tractor.R index 6030482..1e48d40 100755 --- a/scripts/run_tractor.R +++ b/scripts/run_tractor.R @@ -253,11 +253,9 @@ RunTractor <- function(prefix, phenofile, sampleidcol, phenocol, covarcollist, c inPipes = lapply(inFiles, function(file) { if (!endsWith(file,".gz")) { con <- pipe(sprintf("cat %s", shQuote(file)), open="r") - on.exit(try(close(con), silent=TRUE)) return(con) } else { con <- pipe(sprintf("gzip -cd %s", shQuote(file)), open="r") - on.exit(try(close(con), silent=TRUE)) return(con) } }) @@ -430,8 +428,13 @@ RunTractor <- function(prefix, phenofile, sampleidcol, phenocol, covarcollist, c # Updating looping variables iters = iters + 1 } + + lapply(inPipes, function(pipe) { + try(close(pipe), silent=TRUE) + }) } + RunTractor(prefix = opt$hapdose, phenofile = opt$phenofile, sampleidcol = opt$sampleidcol,