From 0af2f0a0c056ce898547a55e565366825cc94749 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 4 Sep 2023 08:54:51 -0700 Subject: [PATCH 01/11] Refactor genetics pipeline to allow standalone import of TSVs that were calculated separately --- .../org/labkey/api/ehr/EHRService.java | 13 ++ .../pipeline/kinship/populateInbreeding.r | 53 ++++--- .../pipeline/kinship/populateKinship.r | 109 +++++++++++---- ehr/src/org/labkey/ehr/EHRServiceImpl.java | 9 ++ .../GeneticCalculationsImportTask.java | 130 ++++++++---------- .../pipeline/GeneticCalculationsRTask.java | 2 +- 6 files changed, 189 insertions(+), 127 deletions(-) diff --git a/ehr/api-src/org/labkey/api/ehr/EHRService.java b/ehr/api-src/org/labkey/api/ehr/EHRService.java index 550935660..2d7e8b092 100644 --- a/ehr/api-src/org/labkey/api/ehr/EHRService.java +++ b/ehr/api-src/org/labkey/api/ehr/EHRService.java @@ -15,6 +15,7 @@ */ package org.labkey.api.ehr; +import org.apache.logging.log4j.Logger; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import org.labkey.api.data.AbstractTableInfo; @@ -31,6 +32,7 @@ import org.labkey.api.ehr.history.*; import org.labkey.api.ldk.table.ButtonConfigFactory; import org.labkey.api.module.Module; +import org.labkey.api.pipeline.PipelineJobException; import org.labkey.api.query.BatchValidationException; import org.labkey.api.query.DetailsURL; import org.labkey.api.query.ExprColumn; @@ -43,6 +45,7 @@ import org.labkey.api.view.ActionURL; import org.labkey.api.view.template.ClientDependency; +import java.io.File; import java.io.IOException; import java.util.Collection; import java.util.Date; @@ -326,4 +329,14 @@ public EHRQCState getQCState(@NotNull Container c) abstract public List ensureStudyQCStates(Container c, final User u, final boolean commitChanges); abstract public void registerLabWorkOverrides(Module module, String fromType, LabworkType toType); + + /** + * The EHR has a built-in GeneticsCalculations pipeline job that computes inbreeding and kinship based on the pedigree. + * These are normally calculated in R, saved as TSVs, and imported using java code. This method is a separate entrypoint + * that allows other code perform the calculations, save the results as TSVs, and then trigger import here. + * + * A use case is a separate pipeline server that performs the R computation on a cluster, and then triggers the main webserver to import + * those results. + */ + abstract public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException; } diff --git a/ehr/resources/pipeline/kinship/populateInbreeding.r b/ehr/resources/pipeline/kinship/populateInbreeding.r index 9bf71ba85..49145eec5 100644 --- a/ehr/resources/pipeline/kinship/populateInbreeding.r +++ b/ehr/resources/pipeline/kinship/populateInbreeding.r @@ -7,47 +7,46 @@ # This R script will calculate and store inbreeding coefficients for all animals in the colony. This data will be compared against # the information currently stored in the DB and the minimal number of inserts/updates/deletes are then performed. This script is designed # to run as a daily cron job. - - -options(error = dump.frames); -library(pedigree); -library(getopt); -library(Matrix); +options(error = dump.frames) +library(pedigree) +library(getopt) +library(Matrix) library(dplyr) spec <- matrix(c( -'inputFile', '-f', 1, "character" -), ncol=4, byrow=TRUE); -opts = getopt(spec, commandArgs(trailingOnly = TRUE)); + 'inputFile', '-f', 1, 'character' +), ncol=4, byrow=TRUE) +opts <- getopt(spec, commandArgs(trailingOnly = TRUE)) -allPed <- read.table(opts$inputFile); +allPed <- read.table(opts$inputFile) colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender') -is.na(allPed$Id)<-which(allPed$Id=="") -is.na(allPed$Dam)<-which(allPed$Dam=="") -is.na(allPed$Sire)<-which(allPed$Sire=="") -is.na(allPed$Gender)<-which(allPed$Gender=="") - -df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE); -colnames(df)<-c("id", "id parent1", "id parent2") +allPed$Id[allPed$Id == ""] <- NA +allPed$Id[allPed$Dam == ""] <- NA +allPed$Id[allPed$Sire == ""] <- NA +allPed$Id[allPed$Gender == ""] <- NA -originalIds <-as.data.frame(df[,1,drop=FALSE]) +df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE) +originalIds <- df$id #this is a function in the pedigree package designed to add missing parents to the dataframe #see pedigree package documentation for more detail -df <- add.Inds(df); +df <- add.Inds(df) ord <- orderPed(df) df <- df[order(ord),] #use an existing package to calculate inbreeding -ib = calcInbreeding(df); - -newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE); +ib <- calcInbreeding(df) #only calculate inbreeding for Ids at the center -newRecords <- dplyr::filter(newRecords, Id %in% originalIds$id) +newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE) %>% + dplyr::filter(Id %in% originalIds) + +print("Output table:") +print(str(newRecords)) + +if (nrow(newRecords) != length(originalIds)) { + stop(paste0('Output dataframe and input IDs not the same length! Expected: ', length(originalIds), ', was: ', nrow(newRecords))) +} -# write TSV to disk -print("Output table:"); -print(str(newRecords)); -write.table(newRecords, file = "inbreeding.txt", append = FALSE,row.names=F,quote=F,sep="\t"); \ No newline at end of file +write.table(newRecords, file = "inbreeding.txt", append = FALSE, row.names=F, quote=F, sep="\t") \ No newline at end of file diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index abffbd120..e930530a1 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -7,30 +7,24 @@ # This R script will calculate and store kinship coefficients (aka. relatedness) for all animals in the colony. This is a large, sparse matrix. # The matrix is converted into a very long 3-column dataframe (animal1, animal2, coefficient). This dataframe is output to a TSV file, # which is normally imported into ehr.kinship by java code in GeneticCalculationsImportTask - - -#options(echo=TRUE); -options(error = dump.frames); -library(methods); -library(kinship2); -library(getopt); -library(Matrix); -library(dplyr); +options(error = dump.frames) +library(kinship2) +library(getopt) +library(Matrix) +library(dplyr) spec <- matrix(c( -#'containerPath', '-c', 1, "character", -#'baseUrl', '-b', 1, "character" -'inputFile', '-f', 1, "character" -), ncol=4, byrow=TRUE); -opts = getopt(spec, commandArgs(trailingOnly = TRUE)); + 'inputFile', '-f', 1, "character" +), ncol=4, byrow=TRUE) +opts <- getopt(spec, commandArgs(trailingOnly = TRUE)) -allPed <- read.table(opts$inputFile, quote="\""); -colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species'); +allPed <- read.table(opts$inputFile, quote="\"") +colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species') -is.na(allPed$Id)<-which(allPed$Id=="") -is.na(allPed$Dam)<-which(allPed$Dam=="") -is.na(allPed$Sire)<-which(allPed$Sire=="") -is.na(allPed$Gender)<-which(allPed$Gender=="") +allPed$Id[allPed$Id == ""] <- NA +allPed$Id[allPed$Dam == ""] <- NA +allPed$Id[allPed$Sire == ""] <- NA +allPed$Id[allPed$Gender == ""] <- NA allPed$Species <- as.character(allPed$Species) allPed$Species[is.na(allPed$Species)] <- c('Unknown') @@ -38,10 +32,10 @@ allPed$Species <- as.factor(allPed$Species) # In order to reduce the max matrix size, calculate famids using makefamid, then analyze each group separately # It resizes the biggest matrix from 12000^2 to 8200^2 thus reduces the memory used by half -newRecords=NULL +newRecords <- NULL for (species in unique(allPed$Species)){ - print(paste0('processing species: ', species)) allRecordsForSpecies <- allPed[allPed$Species == species,] + print(paste0('Processing species: ', species, ', with ', nrow(allRecordsForSpecies), ' IDs')) # Add missing parents for accurate kinship calculations fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender)) @@ -49,25 +43,80 @@ for (species in unique(allPed$Species)){ # Kinship is expecting records to be sorted IAW it's own pedigree function recordsForSpecies <- with(fixedRecords, pedigree(id=id,dadid=dadid,momid=momid,sex=sex,missid=0)) - temp.kin=kinship(recordsForSpecies) + temp.kin <- kinship(recordsForSpecies) # Add rownames to make matrix symmetric, which is required downstream rownames(temp.kin) <- colnames(temp.kin) # Convert kinship matrix to a triplet list of two ids and their coefficient - summaryDf = as.data.frame(summary(as(temp.kin, "dgCMatrix"))) + summaryDf <- as.data.frame(summary(as(temp.kin, "dgCMatrix"))) idList <- rownames(temp.kin) - temp.tri = data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x) + temp.tri <- data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x) # Now filter out parents added for kinship calculation temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE)) temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE)) - newRecords=rbind(newRecords,temp.tri) - print(paste0('total subjects: ', nrow(allRecordsForSpecies))) + newRecords <- rbind(newRecords,temp.tri) + print(paste0('Total subjects: ', nrow(allRecordsForSpecies))) +} + +generateExpectedKinship <- function(pedDf) { + parentChild <- rbind( + data.frame(Id = pedDf$Id, Id2 = pedDf$Dam, Species = pedDf$Species, Relationship = 'Child/Parent'), + data.frame(Id = pedDf$Id, Id2 = pedDf$Sire, Species = pedDf$Species, Relationship = 'Child/Parent') + ) %>% filter(!is.na(Id2)) %>% mutate(ExpectedCoefficient = 0.5) + + grandParentOffspring1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf, by.x = c('Dam', 'Species'), by.y = c('Id', 'Species'), all.x = F, all.y = F) + grandParentOffspring1 <- rbind( + grandParentOffspring1 %>% select(Id, Dam.y, Species) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Maternal Granddam'), + grandParentOffspring1 %>% select(Id, Sire.y, Species) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Maternal Grandsire') + ) %>% mutate(ExpectedCoefficient = 0.25) + + grandParentOffspring2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf, by.x = c('Sire', 'Species'), by.y = c('Id', 'Species'), all.x = F, all.y = F) + grandParentOffspring2 <- rbind( + grandParentOffspring2 %>% select(Id, Dam.y, Species) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Paternal Granddam'), + grandParentOffspring2 %>% select(Id, Sire.y, Species) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Paternal Grandsire') + ) %>% mutate(ExpectedCoefficient = 0.25) + + fullSibs <- merge(pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], by = c('Sire', 'Dam', 'Species'), all.x = F, all.y = F) %>% + select(Id.x, Id.y, Species) %>% + rename(Id = Id.x, Id2 = Id.y) %>% + mutate(Relationship = 'Full sib', ExpectedCoefficient = 0.25) + + ret <- rbind( + parentChild, + grandParentOffspring1, + grandParentOffspring2, + fullSibs + ) + + # Generate the reciprocal of relationships as well: + ret2 <- data.frame(Id = ret$Id2, Id2 = ret$Id, Species = ret$Species, Relationship = ret$Relationship, ExpectedCoefficient = ret$ExpectedCoefficient) + ret2$Relationship <- sapply(ret2$Relationship, function(x){ + x <- unlist(strsplit(x, split = '/')) + if (length(x) == 1) { + return(x) + } + + return(paste0(x[2], '/', x[1])) + }) + + return(rbind(ret, ret2)) +} + +# Basic validation: +toValidate <- merge(newRecords, generateExpectedKinship(pedDf), by = c('Id', 'Id2', 'Species'), all.x = T, all.y = T) %>% + filter(!is.na(ExpectedCoefficient)) %>% + mutate(MinCoefficient = ExpectedCoefficient / 2) %>% + filter(is.na(coefficient) | coefficient < MinCoefficient) + +if (nrow(toValidate) > 0) { + print(paste0('There were unexpected kinship values! See the file kinshipErrors.txt for more information')) + write.table(newRecords, file = "kinshipErrors.txt", append = FALSE, row.names=F, quote=F, sep='\t') } # write TSV to disk -print("Output table:"); -print(str(newRecords)); -write.table(newRecords, file = "kinship.txt", append = FALSE,row.names=F,quote=F,sep="\t"); \ No newline at end of file +print("Output table:") +print(str(newRecords)) +write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep='\t') \ No newline at end of file diff --git a/ehr/src/org/labkey/ehr/EHRServiceImpl.java b/ehr/src/org/labkey/ehr/EHRServiceImpl.java index 039a34b10..0b9baed00 100644 --- a/ehr/src/org/labkey/ehr/EHRServiceImpl.java +++ b/ehr/src/org/labkey/ehr/EHRServiceImpl.java @@ -50,6 +50,7 @@ import org.labkey.api.module.ModuleLoader; import org.labkey.api.module.ModuleProperty; import org.labkey.api.pipeline.PipeRoot; +import org.labkey.api.pipeline.PipelineJobException; import org.labkey.api.pipeline.PipelineService; import org.labkey.api.query.BatchValidationException; import org.labkey.api.query.DetailsURL; @@ -78,10 +79,12 @@ import org.labkey.ehr.history.DefaultObservationsDataSource; import org.labkey.ehr.history.DefaultPregnanciesDataSource; import org.labkey.ehr.history.LabworkManager; +import org.labkey.ehr.pipeline.GeneticCalculationsImportTask; import org.labkey.ehr.security.EHRSecurityManager; import org.labkey.ehr.table.DefaultEHRCustomizer; import org.labkey.ehr.table.SNOMEDCodesDisplayColumn; +import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; @@ -1061,4 +1064,10 @@ public Map> getLabWorkOverrides() { return _labWorkOverrides; } + + @Override + public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException + { + GeneticCalculationsImportTask.standaloneProcessKinshipAndInbreeding(c, u, pipelineDir, log); + } } diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java index 28d277270..5bd5adbe5 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java @@ -16,6 +16,7 @@ package org.labkey.ehr.pipeline; import org.apache.commons.lang3.StringUtils; +import org.apache.logging.log4j.Logger; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import org.labkey.api.collections.CaseInsensitiveHashMap; @@ -50,6 +51,7 @@ import org.labkey.api.query.QueryUpdateServiceException; import org.labkey.api.query.UserSchema; import org.labkey.api.query.ValidationException; +import org.labkey.api.reader.Readers; import org.labkey.api.security.User; import org.labkey.api.util.FileType; import org.labkey.api.util.PageFlowUtil; @@ -116,7 +118,7 @@ public List getProtocolActionNames() } @Override - public PipelineJob.Task createTask(PipelineJob job) + public PipelineJob.Task createTask(PipelineJob job) { GeneticCalculationsImportTask task = new GeneticCalculationsImportTask(this, job); setJoin(false); @@ -145,8 +147,11 @@ public RecordedActionSet run() throws PipelineJobException } else { - processInbreeding(); - processKinship(); + PipelineJob job = getJob(); + FileAnalysisJobSupport support = (FileAnalysisJobSupport) job; + + processInbreeding(job.getContainer(), job.getUser(), support.getAnalysisDirectoryPath().toFile(), job.getLogger()); + processKinship(job.getContainer(), job.getUser(), support.getAnalysisDirectoryPath().toFile(), job.getLogger()); if (GeneticCalculationsJob.isKinshipValidation()) validateKinship(); @@ -155,24 +160,27 @@ public RecordedActionSet run() throws PipelineJobException return new RecordedActionSet(actions); } - private void processKinship() throws PipelineJobException + public static void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException { - PipelineJob job = getJob(); - FileAnalysisJobSupport support = (FileAnalysisJobSupport) job; + processInbreeding(c, u, pipelineDir, log); + processKinship(c, u, pipelineDir, log); + } - File output = new File(support.getAnalysisDirectory(), KINSHIP_FILE); + private static void processKinship(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException + { + File output = new File(pipelineDir, KINSHIP_FILE); if (!output.exists()) throw new PipelineJobException("Unable to find file: " + output.getPath()); DbSchema ehrSchema = EHRSchema.getInstance().getSchema(); TableInfo kinshipTable = ehrSchema.getTable("kinship"); - getJob().getLogger().info("Inspecting file length: " + output.getPath()); + log.info("Inspecting file length: " + output.getPath()); try { try (DbScope.Transaction transaction = ExperimentService.get().ensureTransaction(); - LineNumberReader lnr = new LineNumberReader(new BufferedReader(new FileReader(output)))) + LineNumberReader lnr = new LineNumberReader(Readers.getReader(output))) { while (lnr.readLine() != null) { @@ -180,19 +188,19 @@ private void processKinship() throws PipelineJobException break; } int lineNumber = lnr.getLineNumber(); - lnr.close(); - if (lineNumber < 3) + { throw new PipelineJobException("Too few lines found in output. Line count was: " + lineNumber); + } //delete all previous records - getJob().getLogger().info("Deleting existing rows"); - Table.delete(kinshipTable, new SimpleFilter(FieldKey.fromString("container"), getJob().getContainerId(), CompareType.EQUAL)); + log.info("Deleting existing rows"); + Table.delete(kinshipTable, new SimpleFilter(FieldKey.fromString("container"), c.getId(), CompareType.EQUAL)); //NOTE: this process creates and deletes a ton of rows each day. the rowId can balloon very quickly, so we reset it here SqlSelector ss = new SqlSelector(kinshipTable.getSchema(), new SQLFragment("SELECT max(rowid) as expt FROM " + kinshipTable.getSelectName())); List ret = ss.getArrayList(Long.class); - Integer maxVal; + int maxVal; if (ret.isEmpty()) { maxVal = 0; @@ -230,9 +238,9 @@ else if (kinshipTable.getSqlDialect().isPostgreSQL()) } try (DbScope.Transaction transaction = ExperimentService.get().ensureTransaction(); - BufferedReader reader = new BufferedReader(new FileReader(output))) + BufferedReader reader = Readers.getReader(output)) { - getJob().getLogger().info("Inserting rows"); + log.info("Inserting rows"); String line = null; int lineNum = 0; while ((line = reader.readLine()) != null) @@ -246,7 +254,7 @@ else if (kinshipTable.getSqlDialect().isPostgreSQL()) if (fields[0].equalsIgnoreCase(fields[1])) continue; //dont import self-kinship - Map row = new HashMap(); + Map row = new HashMap<>(); assert fields[0].length() < 80 : "Field Id value too long: [" + fields[0] + ']'; assert fields[1].length() < 80 : "Field Id2 value too long: [" + fields[1] + "]"; @@ -261,19 +269,19 @@ else if (kinshipTable.getSqlDialect().isPostgreSQL()) throw new PipelineJobException("Invalid kinship coefficient on line " + (lineNum + 1) + " for IDs " + fields[0] + " and " + fields[1] + ": " + fields[2], e); } - row.put("container", job.getContainer().getId()); + row.put("container", c.getId()); row.put("created", new Date()); - row.put("createdby", job.getUser().getUserId()); - Table.insert(job.getUser(), kinshipTable, row); + row.put("createdby", u.getUserId()); + Table.insert(u, kinshipTable, row); lineNum++; if (lineNum % 100000 == 0) { - getJob().getLogger().info("processed " + lineNum + " rows"); + log.info("processed " + lineNum + " rows"); } } - job.getLogger().info("Inserted " + lineNum + " rows into ehr.kinship"); + log.info("Inserted " + lineNum + " rows into ehr.kinship"); transaction.commit(); } } @@ -431,7 +439,7 @@ private void validateSetOfRelations(TableInfo kinshipTable, Map familyMembers) { - PipelineJob job = getJob(); - TableSelector familyTs = new TableSelector(familyTable, new LinkedHashSet(familyMembers)); + TableSelector familyTs = new TableSelector(familyTable, new LinkedHashSet<>(familyMembers)); Map> relations = new HashMap<>(); familyTs.forEach(rs -> { @@ -564,7 +570,7 @@ private boolean validateKinship() return true; } - private TableInfo getRealTable(TableInfo ti) + private static TableInfo getRealTable(TableInfo ti) { Domain domain = ti.getDomain(); if (domain != null) @@ -575,48 +581,44 @@ private TableInfo getRealTable(TableInfo ti) return null; } - private void processInbreeding() throws PipelineJobException + private static void processInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException { - PipelineJob job = getJob(); - FileAnalysisJobSupport support = (FileAnalysisJobSupport) job; - - File output = new File(support.getAnalysisDirectory(), INBREEDING_FILE); + File output = new File(pipelineDir, INBREEDING_FILE); if (!output.exists()) throw new PipelineJobException("Unable to find file: " + output.getPath()); - UserSchema us = QueryService.get().getUserSchema(job.getUser(), job.getContainer(), "study"); + UserSchema us = QueryService.get().getUserSchema(u, c, "study"); TableInfo ti = us.getTable("Inbreeding Coefficients"); if (ti == null) { - getJob().getLogger().warn("Unable to find table study.inbreeding coefficients"); + log.warn("Unable to find table study.inbreeding coefficients"); return; } QueryUpdateService qus = ti.getUpdateService(); qus.setBulkLoad(true); - LineNumberReader lnr = null; - BufferedReader reader = null; - - try + try (BufferedReader reader = Readers.getReader(output)) { try (DbScope.Transaction transaction = ExperimentService.get().ensureTransaction()) { - getJob().getLogger().info("Inspecting file length: " + output.getPath()); - lnr = new LineNumberReader(new BufferedReader(new FileReader(output))); - while (lnr.readLine() != null) + log.info("Inspecting file length: " + output.getPath()); + try (LineNumberReader lnr = new LineNumberReader(Readers.getReader(output))) { - if (lnr.getLineNumber() > 3) - break; + while (lnr.readLine() != null) + { + if (lnr.getLineNumber() > 3) + break; + } + int lineNumber = lnr.getLineNumber(); + if (lineNumber < 3) + { + throw new PipelineJobException("Too few lines found in inbreeding output. Line count was: " + lineNumber); + } } - int lineNumber = lnr.getLineNumber(); - lnr.close(); - - if (lineNumber < 3) - throw new PipelineJobException("Too few lines found in inbreeding output. Line count was: " + lineNumber); //delete all previous records - getJob().getLogger().info("Deleting existing rows"); + log.info("Deleting existing rows"); TableInfo realTable = getRealTable(ti); if (realTable == null) { @@ -628,14 +630,12 @@ private void processInbreeding() throws PipelineJobException transaction.commit(); } - reader = new BufferedReader(new FileReader(output)); - String line; int lineNum = 0; List> rows = new ArrayList<>(); Date date = new Date(); - getJob().getLogger().info("Reading file"); + log.info("Reading file"); while ((line = reader.readLine()) != null){ String[] fields = line.split("\t"); if (fields.length < 2) @@ -643,11 +643,11 @@ private void processInbreeding() throws PipelineJobException if ("coefficient".equalsIgnoreCase(fields[1])) continue; //skip header - Map row = new CaseInsensitiveHashMap(); + Map row = new CaseInsensitiveHashMap<>(); String subjectId = StringUtils.trimToNull(fields[0]); if (subjectId == null) { - getJob().getLogger().error("Missing subjectId on row " + lineNum); + log.error("Missing subjectId on row " + lineNum); continue; } @@ -660,22 +660,22 @@ private void processInbreeding() throws PipelineJobException lineNum++; } - getJob().getLogger().info("Inserting rows"); + log.info("Inserting rows"); BatchValidationException errors = new BatchValidationException(); Map options = new HashMap<>(); - options.put(QueryUpdateService.ConfigParameters.Logger, getJob().getLogger()); + options.put(QueryUpdateService.ConfigParameters.Logger, log); try (DbScope.Transaction transaction = ExperimentService.get().ensureTransaction()) { - qus.insertRows(getJob().getUser(), getJob().getContainer(), rows, errors, options, new HashMap()); + qus.insertRows(u, c, rows, errors, options, new HashMap<>()); if (errors.hasErrors()) throw errors; transaction.commit(); } - job.getLogger().info("Inserted " + lineNum + " rows into inbreeding coefficients table"); + log.info("Inserted " + lineNum + " rows into inbreeding coefficients table"); } catch (DuplicateKeyException | SQLException | IOException | QueryUpdateServiceException e) @@ -684,21 +684,13 @@ private void processInbreeding() throws PipelineJobException } catch (BatchValidationException e) { - getJob().getLogger().info("error inserting rows"); + log.info("error inserting rows"); for (ValidationException ve : e.getRowErrors()) { - getJob().getLogger().info(ve.getMessage()); + log.info(ve.getMessage()); } throw new PipelineJobException(e); } - finally - { - if (lnr != null) - try{lnr.close();}catch (Exception ignored){} - - if (reader != null) - try{reader.close();}catch (Exception ignored){} - } } } diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java index 7a075a391..1fe9d72aa 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java @@ -82,7 +82,7 @@ public List getProtocolActionNames() } @Override - public PipelineJob.Task createTask(PipelineJob job) + public PipelineJob.Task createTask(PipelineJob job) { GeneticCalculationsRTask task = new GeneticCalculationsRTask(this, job); setJoin(false); From b64fab302f5d638416b85664ed57b7ac088cf775 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 4 Sep 2023 12:04:12 -0700 Subject: [PATCH 02/11] Improve R scripts: - Make R scripts exit immediately on error - Bugfix to expected kinship/validation --- .../pipeline/kinship/populateInbreeding.r | 5 +-- .../pipeline/kinship/populateKinship.r | 32 +++++++++++-------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/ehr/resources/pipeline/kinship/populateInbreeding.r b/ehr/resources/pipeline/kinship/populateInbreeding.r index 49145eec5..629178452 100644 --- a/ehr/resources/pipeline/kinship/populateInbreeding.r +++ b/ehr/resources/pipeline/kinship/populateInbreeding.r @@ -7,7 +7,6 @@ # This R script will calculate and store inbreeding coefficients for all animals in the colony. This data will be compared against # the information currently stored in the DB and the minimal number of inserts/updates/deletes are then performed. This script is designed # to run as a daily cron job. -options(error = dump.frames) library(pedigree) library(getopt) library(Matrix) @@ -28,6 +27,7 @@ allPed$Id[allPed$Gender == ""] <- NA df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE) originalIds <- df$id +print(paste0('Input IDs: ', nrow(df))) #this is a function in the pedigree package designed to add missing parents to the dataframe #see pedigree package documentation for more detail @@ -42,9 +42,6 @@ ib <- calcInbreeding(df) newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE) %>% dplyr::filter(Id %in% originalIds) -print("Output table:") -print(str(newRecords)) - if (nrow(newRecords) != length(originalIds)) { stop(paste0('Output dataframe and input IDs not the same length! Expected: ', length(originalIds), ', was: ', nrow(newRecords))) } diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index e930530a1..762329d7a 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -7,7 +7,6 @@ # This R script will calculate and store kinship coefficients (aka. relatedness) for all animals in the colony. This is a large, sparse matrix. # The matrix is converted into a very long 3-column dataframe (animal1, animal2, coefficient). This dataframe is output to a TSV file, # which is normally imported into ehr.kinship by java code in GeneticCalculationsImportTask -options(error = dump.frames) library(kinship2) library(getopt) library(Matrix) @@ -56,32 +55,36 @@ for (species in unique(allPed$Species)){ # Now filter out parents added for kinship calculation temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE)) temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE)) + temp.tri$Species <- species newRecords <- rbind(newRecords,temp.tri) - print(paste0('Total subjects: ', nrow(allRecordsForSpecies))) } generateExpectedKinship <- function(pedDf) { + # See reference: https://en.wikipedia.org/wiki/Coefficient_of_relationship#Kinship_coefficient + self <- data.frame(Id = pedDf$Id, Id2 = pedDf$Id, Species = pedDf$Species, Relationship = 'Self', ExpectedCoefficient = 0.5) + parentChild <- rbind( data.frame(Id = pedDf$Id, Id2 = pedDf$Dam, Species = pedDf$Species, Relationship = 'Child/Parent'), data.frame(Id = pedDf$Id, Id2 = pedDf$Sire, Species = pedDf$Species, Relationship = 'Child/Parent') - ) %>% filter(!is.na(Id2)) %>% mutate(ExpectedCoefficient = 0.5) + ) %>% filter(!is.na(Id2)) %>% mutate(ExpectedCoefficient = 0.25) grandParentOffspring1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf, by.x = c('Dam', 'Species'), by.y = c('Id', 'Species'), all.x = F, all.y = F) grandParentOffspring1 <- rbind( grandParentOffspring1 %>% select(Id, Dam.y, Species) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Maternal Granddam'), grandParentOffspring1 %>% select(Id, Sire.y, Species) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Maternal Grandsire') - ) %>% mutate(ExpectedCoefficient = 0.25) + ) %>% mutate(ExpectedCoefficient = 0.125) grandParentOffspring2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf, by.x = c('Sire', 'Species'), by.y = c('Id', 'Species'), all.x = F, all.y = F) grandParentOffspring2 <- rbind( grandParentOffspring2 %>% select(Id, Dam.y, Species) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Paternal Granddam'), grandParentOffspring2 %>% select(Id, Sire.y, Species) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Paternal Grandsire') - ) %>% mutate(ExpectedCoefficient = 0.25) + ) %>% mutate(ExpectedCoefficient = 0.125) fullSibs <- merge(pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], by = c('Sire', 'Dam', 'Species'), all.x = F, all.y = F) %>% select(Id.x, Id.y, Species) %>% rename(Id = Id.x, Id2 = Id.y) %>% + filter(Id != Id2) %>% mutate(Relationship = 'Full sib', ExpectedCoefficient = 0.25) ret <- rbind( @@ -102,21 +105,24 @@ generateExpectedKinship <- function(pedDf) { return(paste0(x[2], '/', x[1])) }) - return(rbind(ret, ret2)) + return(rbind(self, ret, ret2)) } # Basic validation: -toValidate <- merge(newRecords, generateExpectedKinship(pedDf), by = c('Id', 'Id2', 'Species'), all.x = T, all.y = T) %>% +toValidate <- merge(newRecords, generateExpectedKinship(allPed), by = c('Id', 'Id2', 'Species'), all.x = T, all.y = T) +write.table(file = 'kinshipWithExpectedValues.txt', toValidate, sep = '\t', quote = F) + +toValidate <- toValidate %>% filter(!is.na(ExpectedCoefficient)) %>% - mutate(MinCoefficient = ExpectedCoefficient / 2) %>% - filter(is.na(coefficient) | coefficient < MinCoefficient) + filter(is.na(coefficient) | coefficient < ExpectedCoefficient) if (nrow(toValidate) > 0) { print(paste0('There were unexpected kinship values! See the file kinshipErrors.txt for more information')) - write.table(newRecords, file = "kinshipErrors.txt", append = FALSE, row.names=F, quote=F, sep='\t') + write.table(newRecords, file = "kinshipErrors.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t') +} else { + print('All coefficients were within expected ranges from predicted values') } +newRecords$Species <- NULL # write TSV to disk -print("Output table:") -print(str(newRecords)) -write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep='\t') \ No newline at end of file +write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t') \ No newline at end of file From 233c7a6e2dfbf75e2ec0af3730c6fd4a068cbd3d Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 4 Sep 2023 12:05:18 -0700 Subject: [PATCH 03/11] Add action to allow external server to trigger re-import of externally computed kinship data --- .../ehr/pipeline/GeneticCalculationsRunnable.java | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java index 4d6f0bf5b..b3bd6c833 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java @@ -31,6 +31,7 @@ import org.labkey.api.pipeline.file.FileAnalysisTaskPipeline; import org.labkey.api.security.User; import org.labkey.api.util.ConfigurationException; +import org.labkey.api.util.StringUtilsLabKey; import org.labkey.api.util.logging.LogHelper; import org.labkey.api.view.ActionURL; import org.labkey.api.view.ViewBackgroundInfo; @@ -73,12 +74,12 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay { String taskIdString = FileAnalysisTaskPipeline.class.getName() + ":" + KINSHIP_PIPELINE_NAME; TaskId taskId = new TaskId(taskIdString); - TaskPipeline taskPipeline = PipelineJobService.get().getTaskPipeline(taskId); + TaskPipeline taskPipeline = PipelineJobService.get().getTaskPipeline(taskId); if (taskPipeline == null) throw new PipelineJobException("Unable to find kinship pipeline: " + taskId); AbstractFileAnalysisProvider provider = (AbstractFileAnalysisProvider) PipelineService.get().getPipelineProvider("File Analysis"); - AbstractFileAnalysisProtocolFactory factory = provider.getProtocolFactory(taskPipeline); + AbstractFileAnalysisProtocolFactory factory = provider.getProtocolFactory(taskPipeline); ViewBackgroundInfo bg = new ViewBackgroundInfo(c, u, new ActionURL()); PipeRoot root = PipelineService.get().getPipelineRootSetting(c); String protocolName = "EHR Kinship Calculation"; @@ -87,7 +88,7 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay (allowRunningDuringDay ? "\ttrue" : "") + ""; - AbstractFileAnalysisProtocol protocol = factory.createProtocolInstance(protocolName, "", xml); + AbstractFileAnalysisProtocol protocol = factory.createProtocolInstance(protocolName, "", xml); if (protocol == null) { return; @@ -111,7 +112,7 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay defaultXml.getParentFile().mkdirs(); defaultXml.createNewFile(); - try (FileWriter w = new FileWriter(defaultXml)) + try (FileWriter w = new FileWriter(defaultXml, StringUtilsLabKey.DEFAULT_CHARSET)) { w.write(xml); } @@ -135,11 +136,7 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay { throw new ConfigurationException("The EHR kinship pipeline has not been configured on this server", e); } - catch (IOException e) - { - throw new PipelineJobException(e); - } - catch (PipelineValidationException e) + catch (IOException | PipelineValidationException e) { throw new PipelineJobException(e); } From bf5c98d84f852ca31583fe97bbdb86554d4e6930 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 4 Sep 2023 13:35:12 -0700 Subject: [PATCH 04/11] Correct typos --- ehr/resources/pipeline/kinship/populateInbreeding.r | 6 +++--- ehr/resources/pipeline/kinship/populateKinship.r | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ehr/resources/pipeline/kinship/populateInbreeding.r b/ehr/resources/pipeline/kinship/populateInbreeding.r index 629178452..bc5b6b95e 100644 --- a/ehr/resources/pipeline/kinship/populateInbreeding.r +++ b/ehr/resources/pipeline/kinship/populateInbreeding.r @@ -21,9 +21,9 @@ allPed <- read.table(opts$inputFile) colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender') allPed$Id[allPed$Id == ""] <- NA -allPed$Id[allPed$Dam == ""] <- NA -allPed$Id[allPed$Sire == ""] <- NA -allPed$Id[allPed$Gender == ""] <- NA +allPed$Dam[allPed$Dam == ""] <- NA +allPed$Sire[allPed$Sire == ""] <- NA +allPed$Gender[allPed$Gender == ""] <- NA df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE) originalIds <- df$id diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index 762329d7a..36ff7be58 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -21,9 +21,9 @@ allPed <- read.table(opts$inputFile, quote="\"") colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species') allPed$Id[allPed$Id == ""] <- NA -allPed$Id[allPed$Dam == ""] <- NA -allPed$Id[allPed$Sire == ""] <- NA -allPed$Id[allPed$Gender == ""] <- NA +allPed$Dam[allPed$Dam == ""] <- NA +allPed$Sire[allPed$Sire == ""] <- NA +allPed$Gender[allPed$Gender == ""] <- NA allPed$Species <- as.character(allPed$Species) allPed$Species[is.na(allPed$Species)] <- c('Unknown') From aed23f72498afbbe128bb459199794b3beddc309 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 4 Sep 2023 14:04:13 -0700 Subject: [PATCH 05/11] Minor cleanup --- .../org/labkey/ehr/pipeline/GeneticCalculationsInitTask.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsInitTask.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsInitTask.java index 14cb4948f..fb6482853 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsInitTask.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsInitTask.java @@ -33,6 +33,7 @@ import org.labkey.api.query.UserSchema; import org.labkey.api.util.FileType; import org.labkey.api.util.PageFlowUtil; +import org.labkey.api.writer.PrintWriters; import org.springframework.jdbc.BadSqlGrammarException; import java.io.File; @@ -84,7 +85,7 @@ public List getProtocolActionNames() } @Override - public PipelineJob.Task createTask(PipelineJob job) + public PipelineJob.Task createTask(PipelineJob job) { GeneticCalculationsInitTask task = new GeneticCalculationsInitTask(this, job); setJoin(false); @@ -138,7 +139,7 @@ public RecordedActionSet run() throws PipelineJobException File outputFile = new File(support.getAnalysisDirectory(), GeneticCalculationsImportTask.PEDIGREE_FILE); - try (CSVWriter writer = new CSVWriter(new OutputStreamWriter(new FileOutputStream(outputFile)), '\t', CSVWriter.DEFAULT_QUOTE_CHARACTER)) + try (CSVWriter writer = new CSVWriter(PrintWriters.getPrintWriter(outputFile), '\t', CSVWriter.DEFAULT_QUOTE_CHARACTER)) { long count = ts.getRowCount(); if (count > 0) From e8cd2a073e2653753f9cff8733cb5c5503614e6c Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 5 Sep 2023 14:48:39 -0700 Subject: [PATCH 06/11] - Refactor kinship script to further reduce memory - Refactor kinship script to optionally merge species where hybrids are present and process together --- .../pipeline/kinship/populateKinship.r | 248 +++++++++++++----- .../panel/GeneticCalculationSettingsPanel.js | 18 +- ehr/src/org/labkey/ehr/EHRController.java | 14 +- .../GeneticCalculationsImportTask.java | 4 +- .../ehr/pipeline/GeneticCalculationsJob.java | 13 +- .../pipeline/GeneticCalculationsRTask.java | 23 +- .../pipeline/GeneticCalculationsRunnable.java | 2 + 7 files changed, 243 insertions(+), 79 deletions(-) diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index 36ff7be58..ed1e55037 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -13,116 +13,222 @@ library(Matrix) library(dplyr) spec <- matrix(c( - 'inputFile', '-f', 1, "character" + 'inputFile', '-f', 1, 'character', + 'mergeSpeciesWithHybrids', '-m', 0, 'logical', + 'performKinshipValidation', '-v', 0, 'logical' ), ncol=4, byrow=TRUE) opts <- getopt(spec, commandArgs(trailingOnly = TRUE)) +if (is.null(opts$mergeSpeciesWithHybrids)){ + opts$mergeSpeciesWithHybrids <- FALSE +} + +if (is.null(opts$performKinshipValidation)){ + opts$performKinshipValidation <- FALSE +} + allPed <- read.table(opts$inputFile, quote="\"") colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species') allPed$Id[allPed$Id == ""] <- NA allPed$Dam[allPed$Dam == ""] <- NA allPed$Sire[allPed$Sire == ""] <- NA -allPed$Gender[allPed$Gender == ""] <- NA +allPed$Gender[allPed$Gender == "" | is.na(allPed$Gender)] <- 3 # 3 = unknown allPed$Species <- as.character(allPed$Species) allPed$Species[is.na(allPed$Species)] <- c('Unknown') allPed$Species <- as.factor(allPed$Species) -# In order to reduce the max matrix size, calculate famids using makefamid, then analyze each group separately -# It resizes the biggest matrix from 12000^2 to 8200^2 thus reduces the memory used by half -newRecords <- NULL -for (species in unique(allPed$Species)){ - allRecordsForSpecies <- allPed[allPed$Species == species,] - print(paste0('Processing species: ', species, ', with ', nrow(allRecordsForSpecies), ' IDs')) - - # Add missing parents for accurate kinship calculations - fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender)) - - # Kinship is expecting records to be sorted IAW it's own pedigree function - recordsForSpecies <- with(fixedRecords, pedigree(id=id,dadid=dadid,momid=momid,sex=sex,missid=0)) - - temp.kin <- kinship(recordsForSpecies) +if (any(allPed$Species == 'Unknown')) { + print(paste0('There are ', sum(allPed$Species == 'Unknown'), ' Ids with species = Unknown')) +} - # Add rownames to make matrix symmetric, which is required downstream - rownames(temp.kin) <- colnames(temp.kin) +# The purpose of this function is to handle instances where there was cross-breeding. +# While this is probably rare, it can occur. When this happens, simply merge the entire species together and process as one unit. +# This ensures that all relevant ancestors from both species are present +generateSpeciesToProcess <- function(allPed, mergeSpeciesWithHybrids) { + hybridParents <- dplyr::bind_rows( + merge(allPed, allPed, by.x = 'Dam', by.y = 'Id') %>% filter(Species.x != Species.y) %>% select(Id, Dam, Species.x, Species.y) %>% rename(Parent = Dam) %>% mutate(ParentType = 'Dam'), + merge(allPed, allPed, by.x = 'Sire', by.y = 'Id') %>% filter(Species.x != Species.y) %>% select(Id, Sire, Species.x, Species.y) %>% rename(Parent = Sire) %>% mutate(ParentType = 'Sire') + ) - # Convert kinship matrix to a triplet list of two ids and their coefficient - summaryDf <- as.data.frame(summary(as(temp.kin, "dgCMatrix"))) - idList <- rownames(temp.kin) - temp.tri <- data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x) + if (nrow(hybridParents) > 0) { + print(paste0('There were ', nrow(hybridParents), ' records with parents of a different species')) + } + + if (mergeSpeciesWithHybrids && nrow(hybridParents) > 0) { + speciesGroups <- as.list(as.character(unique(allPed$Species))) + toMerge <- unique(hybridParents[c('Species.x', 'Species.y')]) + for (idx in 1:nrow(toMerge)){ + speciesToCollapse <- as.character(unlist(toMerge[idx,,drop = TRUE])) + matchingIdx <- sapply(speciesGroups, function(x){ + return(length(intersect(speciesToCollapse, x)) > 0) + }) + + speciesToCollapse <- sort(unique(c(speciesToCollapse, unlist(speciesGroups[matchingIdx])))) + speciesGroups <- speciesGroups[!matchingIdx] + speciesGroups <- append(speciesGroups, list(speciesToCollapse)) + } - # Now filter out parents added for kinship calculation - temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE)) - temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE)) - temp.tri$Species <- species + print('These species will be merged for processing:') + invisible(lapply(speciesGroups, function(x){ + if (length(x) > 1) { + print(x) + } + })) + + return(speciesGroups) + } else { + return(unique(allPed$Species[allPed$Species != 'Unknown'])) + } +} - newRecords <- rbind(newRecords,temp.tri) +validateExpectedKinshipSubset <- function(dataToTest, expectedValues, errorRows, testReciprocal = TRUE) { + # Generate the reciprocal of relationships as well: + if (testReciprocal) { + ret2 <- data.frame(Id = expectedValues$Id2, Id2 = expectedValues$Id, Relationship = expectedValues$Relationship, ExpectedCoefficient = expectedValues$ExpectedCoefficient) + ret2$Relationship <- sapply(expectedValues$Relationship, function(x){ + x <- unlist(strsplit(x, split = '/')) + if (length(x) == 1) { + return(x) + } + + return(paste0(x[2], '/', x[1])) + }) + expectedValues <- dplyr::bind_rows(expectedValues, ret2) + rm(ret2) + } + + dat <- merge(dataToTest, expectedValues, by = c('Id', 'Id2'), all.x = T, all.y = T) %>% + filter(!is.na(ExpectedCoefficient)) %>% + filter(is.na(coefficient) | coefficient < ExpectedCoefficient) + + if (nrow(dat) == 0) { + return(errorRows) + } + + if (all(is.null(errorRows))) { + return(dat) + } + + return(dplyr::bind_rows(errorRows, dat)) } -generateExpectedKinship <- function(pedDf) { +validateExpectedKinship <- function(pedDf, dataToTest) { + errorRows <- NULL + # See reference: https://en.wikipedia.org/wiki/Coefficient_of_relationship#Kinship_coefficient - self <- data.frame(Id = pedDf$Id, Id2 = pedDf$Id, Species = pedDf$Species, Relationship = 'Self', ExpectedCoefficient = 0.5) + self <- data.frame(Id = pedDf$Id, Id2 = pedDf$Id, Relationship = 'Self', ExpectedCoefficient = 0.5) + errorRows <- validateExpectedKinshipSubset(dataToTest, self, errorRows, testReciprocal = FALSE) + rm(self) - parentChild <- rbind( - data.frame(Id = pedDf$Id, Id2 = pedDf$Dam, Species = pedDf$Species, Relationship = 'Child/Parent'), - data.frame(Id = pedDf$Id, Id2 = pedDf$Sire, Species = pedDf$Species, Relationship = 'Child/Parent') + parentChild <- dplyr::bind_rows( + data.frame(Id = pedDf$Id, Id2 = pedDf$Dam, Relationship = 'Child/Parent'), + data.frame(Id = pedDf$Id, Id2 = pedDf$Sire, Relationship = 'Child/Parent') ) %>% filter(!is.na(Id2)) %>% mutate(ExpectedCoefficient = 0.25) + errorRows <- validateExpectedKinshipSubset(dataToTest, parentChild, errorRows) + rm(parentChild) - grandParentOffspring1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf, by.x = c('Dam', 'Species'), by.y = c('Id', 'Species'), all.x = F, all.y = F) - grandParentOffspring1 <- rbind( - grandParentOffspring1 %>% select(Id, Dam.y, Species) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Maternal Granddam'), - grandParentOffspring1 %>% select(Id, Sire.y, Species) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Maternal Grandsire') + grandParentOffspring1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf, by.x = c('Dam'), by.y = c('Id'), all.x = F, all.y = F) + grandParentOffspring1 <- dplyr::bind_rows( + grandParentOffspring1 %>% select(Id, Dam.y) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Maternal Granddam'), + grandParentOffspring1 %>% select(Id, Sire.y) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Maternal Grandsire') ) %>% mutate(ExpectedCoefficient = 0.125) + errorRows <- validateExpectedKinshipSubset(dataToTest, grandParentOffspring1, errorRows) + rm(grandParentOffspring1) - grandParentOffspring2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf, by.x = c('Sire', 'Species'), by.y = c('Id', 'Species'), all.x = F, all.y = F) - grandParentOffspring2 <- rbind( - grandParentOffspring2 %>% select(Id, Dam.y, Species) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Paternal Granddam'), - grandParentOffspring2 %>% select(Id, Sire.y, Species) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Paternal Grandsire') + grandParentOffspring2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf, by.x = c('Sire'), by.y = c('Id'), all.x = F, all.y = F) + grandParentOffspring2 <- dplyr::bind_rows( + grandParentOffspring2 %>% select(Id, Dam.y) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Paternal Granddam'), + grandParentOffspring2 %>% select(Id, Sire.y) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Paternal Grandsire') ) %>% mutate(ExpectedCoefficient = 0.125) + errorRows <- validateExpectedKinshipSubset(dataToTest, grandParentOffspring2, errorRows) + rm(grandParentOffspring2) - fullSibs <- merge(pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], by = c('Sire', 'Dam', 'Species'), all.x = F, all.y = F) %>% - select(Id.x, Id.y, Species) %>% + fullSibs <- merge(pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], by = c('Sire', 'Dam'), all.x = F, all.y = F) %>% + select(Id.x, Id.y) %>% rename(Id = Id.x, Id2 = Id.y) %>% filter(Id != Id2) %>% mutate(Relationship = 'Full sib', ExpectedCoefficient = 0.25) + errorRows <- validateExpectedKinshipSubset(dataToTest, fullSibs, errorRows) + rm(fullSibs) - ret <- rbind( - parentChild, - grandParentOffspring1, - grandParentOffspring2, - fullSibs - ) - - # Generate the reciprocal of relationships as well: - ret2 <- data.frame(Id = ret$Id2, Id2 = ret$Id, Species = ret$Species, Relationship = ret$Relationship, ExpectedCoefficient = ret$ExpectedCoefficient) - ret2$Relationship <- sapply(ret2$Relationship, function(x){ - x <- unlist(strsplit(x, split = '/')) - if (length(x) == 1) { - return(x) - } + halfSibs1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf[!is.na(pedDf$Dam),], by = c('Dam'), all.x = F, all.y = F) %>% + filter(Sire.x != Sire.y) %>% + select(Id.x, Id.y) %>% + rename(Id = Id.x, Id2 = Id.y) %>% + filter(Id != Id2) %>% + mutate(Relationship = 'Half sib', ExpectedCoefficient = 0.125) + errorRows <- validateExpectedKinshipSubset(dataToTest, halfSibs1, errorRows) + rm(halfSibs1) - return(paste0(x[2], '/', x[1])) - }) + halfSibs2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Sire),], by = c('Sire'), all.x = F, all.y = F) %>% + filter(Dam.x != Dam.y) %>% + select(Id.x, Id.y) %>% + rename(Id = Id.x, Id2 = Id.y) %>% + filter(Id != Id2) %>% + mutate(Relationship = 'Half sib', ExpectedCoefficient = 0.125) + errorRows <- validateExpectedKinshipSubset(dataToTest, halfSibs2, errorRows) + rm(halfSibs2) - return(rbind(self, ret, ret2)) + return(errorRows) } -# Basic validation: -toValidate <- merge(newRecords, generateExpectedKinship(allPed), by = c('Id', 'Id2', 'Species'), all.x = T, all.y = T) -write.table(file = 'kinshipWithExpectedValues.txt', toValidate, sep = '\t', quote = F) +speciesToProcess <- generateSpeciesToProcess(allPed, opts$mergeSpeciesWithHybrids) -toValidate <- toValidate %>% - filter(!is.na(ExpectedCoefficient)) %>% - filter(is.na(coefficient) | coefficient < ExpectedCoefficient) +newRecords <- NULL +for (speciesSet in speciesToProcess){ + allRecordsForSpecies <- allPed[allPed$Species %in% speciesSet,] + print(paste0('Processing species set: ', paste0(speciesSet, collapse = ','), ', with ', nrow(allRecordsForSpecies), ' IDs')) + if (nrow(allRecordsForSpecies) == 1) { + print('single record, skipping') + newRecords <- dplyr::bind_rows(newRecords,data.frame(Id = allRecordsForSpecies$Id, Id2 = allRecordsForSpecies$Id, coefficient = 0.5, Species = allRecordsForSpecies$Species)) + next + } + + pctMissingSex <- sum(allRecordsForSpecies$Gender > 2) / nrow(allRecordsForSpecies) + if (pctMissingSex > 0.25) { + paste0('More than 25% of this species group are missing sex and cannot be processed by fixParents(), skipping') + newRecords <- dplyr::bind_rows(newRecords,data.frame(Id = allRecordsForSpecies$Id, Id2 = allRecordsForSpecies$Id, coefficient = 0.5, Species = allRecordsForSpecies$Species)) + next + } + + # Add missing parents for accurate kinship calculations + fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender)) -if (nrow(toValidate) > 0) { - print(paste0('There were unexpected kinship values! See the file kinshipErrors.txt for more information')) - write.table(newRecords, file = "kinshipErrors.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t') -} else { - print('All coefficients were within expected ranges from predicted values') + # Kinship is expecting records to be sorted IAW it's own pedigree function + recordsForSpecies <- with(fixedRecords, pedigree(id = id, dadid = dadid, momid = momid, sex = sex, missid = 0)) + + temp.kin <- kinship(recordsForSpecies) + + # Add rownames to make matrix symmetric, which is required downstream + rownames(temp.kin) <- colnames(temp.kin) + + # Convert kinship matrix to a triplet list of two ids and their coefficient + summaryDf <- as.data.frame(summary(as(temp.kin, "dgCMatrix"))) + idList <- rownames(temp.kin) + temp.tri <- data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x) + + # Now filter out parents added for kinship calculation + temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE)) + temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE)) + temp.tri <- merge(temp.tri, allRecordsForSpecies[c('Id', 'Species')], by = 'Id', all.x = TRUE) + + newRecords <- dplyr::bind_rows(newRecords,temp.tri) + + # NOTE: perform per-species to save memory + if (opts$performKinshipValidation) { + print('Validating coefficients against expected values') + errorRows <- validateExpectedKinship(allRecordsForSpecies, temp.tri) + if (!all(is.null(errorRows))) { + fileName <- paste0('kinshipErrors_', paste0(speciesSet, collapse = '.'), '.txt') + print(paste0('There were unexpected kinship values! See the file ', fileName, ' for more information')) + write.table(newRecords, file = fileName, row.names = FALSE, quote = FALSE, sep = '\t') + } else { + print('All coefficients were within expected ranges from predicted values') + } + } } -newRecords$Species <- NULL # write TSV to disk write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t') \ No newline at end of file diff --git a/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js b/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js index 0474ed15b..f4b8a3257 100644 --- a/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js +++ b/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js @@ -28,7 +28,7 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { xtype: 'checkbox', fieldLabel: 'Kinship validation?', itemId: 'kinshipValidation', - listeners : { + listeners: { render: function(c) { Ext4.create('Ext.tip.ToolTip', { target: c.getEl(), @@ -38,6 +38,18 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { }); } } + },{ + xtype: 'checkbox', + fieldLabel: 'Merge Species With Hybrids?', + itemId: 'mergeSpeciesWithHybrids', + listeners: { + render: function (c) { + Ext4.create('Ext.tip.ToolTip', { + target: c.getEl(), + html: 'If any hybrid animals are detected, these species groups will be merged and processed as one unit. Merging all these species together ensures that the correct ancestors from each side are present' + }); + } + } },{ xtype: 'numberfield', hideTrigger: true, @@ -94,6 +106,7 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { this.down('#hourOfDay').setValue(results.hourOfDay); this.down('#containerPath').setValue(results.containerPath); this.down('#kinshipValidation').setValue(results.kinshipValidation); + this.down('#mergeSpeciesWithHybrids').setValue(results.mergeSpeciesWithHybrids); }, saveData: function(){ @@ -104,7 +117,8 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { containerPath: this.down('#containerPath').getValue(), enabled: this.down('#enabled').getValue(), hourOfDay: this.down('#hourOfDay').getValue(), - kinshipValidation: this.down('#kinshipValidation').getValue() + kinshipValidation: this.down('#kinshipValidation').getValue(), + mergeSpeciesWithHybrids: this.down('#mergeSpeciesWithHybrids').getValue() }, method : 'POST', scope: this, diff --git a/ehr/src/org/labkey/ehr/EHRController.java b/ehr/src/org/labkey/ehr/EHRController.java index 15b665ed9..de873b492 100644 --- a/ehr/src/org/labkey/ehr/EHRController.java +++ b/ehr/src/org/labkey/ehr/EHRController.java @@ -639,7 +639,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er errors.reject(ERROR_MSG, "Unable to find container for path: " + form.getContainerPath()); return null; } - GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation()); + GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation(), form.isMergeSpeciesWithHybrids()); return new ApiSimpleResponse("success", true); } @@ -759,6 +759,7 @@ public static class ScheduleGeneticCalculationForm private int hourOfDay; private boolean _kinshipValidation; + private boolean _mergeSpeciesWithHybrids; public boolean isEnabled() { @@ -799,6 +800,16 @@ public void setKinshipValidation(boolean kinshipValidation) { _kinshipValidation = kinshipValidation; } + + public boolean isMergeSpeciesWithHybrids() + { + return _mergeSpeciesWithHybrids; + } + + public void setMergeSpeciesWithHybrids(boolean mergeSpeciesWithHybrids) + { + _mergeSpeciesWithHybrids = mergeSpeciesWithHybrids; + } } @RequiresPermission(AdminPermission.class) @@ -817,6 +828,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er ret.put("enabled", GeneticCalculationsJob.isEnabled()); ret.put("hourOfDay", GeneticCalculationsJob.getHourOfDay()); ret.put("kinshipValidation", GeneticCalculationsJob.isKinshipValidation()); + ret.put("mergeSpeciesWithHybrids", GeneticCalculationsJob.isMergeSpeciesWithHybrids()); return new ApiSimpleResponse(ret); } diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java index 5bd5adbe5..30b5ebbc9 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java @@ -153,8 +153,10 @@ public RecordedActionSet run() throws PipelineJobException processInbreeding(job.getContainer(), job.getUser(), support.getAnalysisDirectoryPath().toFile(), job.getLogger()); processKinship(job.getContainer(), job.getUser(), support.getAnalysisDirectoryPath().toFile(), job.getLogger()); - if (GeneticCalculationsJob.isKinshipValidation()) + if (Boolean.parseBoolean(getJob().getParameters().get("kinshipValidation"))) + { validateKinship(); + } } return new RecordedActionSet(actions); diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java index f6bb04170..a5e86ad0a 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java @@ -122,6 +122,16 @@ public static boolean isKinshipValidation() return false; } + public static boolean isMergeSpeciesWithHybrids() + { + Map saved = PropertyManager.getProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN); + + if (saved.containsKey("mergeSpeciesWithHybrids")) + return Boolean.parseBoolean(saved.get("mergeSpeciesWithHybrids")); + else + return false; + } + public static boolean isEnabled() { Map saved = PropertyManager.getProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN); @@ -152,13 +162,14 @@ public static Integer getHourOfDay() return null; } - public static void setProperties(Boolean isEnabled, Container c, Integer hourOfDay, Boolean isKinshipValidation) + public static void setProperties(Boolean isEnabled, Container c, Integer hourOfDay, Boolean isKinshipValidation, Boolean mergeSpeciesWithHybrids) { PropertyManager.PropertyMap props = PropertyManager.getWritableProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN, true); props.put("enabled", isEnabled.toString()); props.put("container", c.getId()); props.put("hourOfDay", hourOfDay.toString()); props.put("kinshipValidation", isKinshipValidation.toString()); + props.put("mergeSpeciesWithHybrids", mergeSpeciesWithHybrids.toString()); props.save(); //unschedule in case settings have changed diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java index 1fe9d72aa..47f7d1a4d 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java @@ -17,6 +17,7 @@ import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; import org.labkey.api.module.Module; import org.labkey.api.module.ModuleLoader; import org.labkey.api.pipeline.AbstractTaskFactory; @@ -103,13 +104,25 @@ public RecordedActionSet run() throws PipelineJobException { List actions = new ArrayList<>(); - actions.add(runScript("populateInbreeding.r", GeneticCalculationsImportTask.INBREEDING_FILE, "Inbreeding Coefficient Output")); - actions.add(runScript("populateKinship.r", GeneticCalculationsImportTask.KINSHIP_FILE, "Kinship Output")); + actions.add(runScript("populateInbreeding.r", GeneticCalculationsImportTask.INBREEDING_FILE, "Inbreeding Coefficient Output", null)); + + List kinshipArgs = new ArrayList<>(); + if (getJob().getParameters().containsKey("mergeSpeciesWithHybrids") && "true".equalsIgnoreCase(getJob().getParameters().get("mergeSpeciesWithHybrids"))) + { + kinshipArgs.add("-m"); + } + + if (getJob().getParameters().containsKey("kinshipValidation") && "true".equalsIgnoreCase(getJob().getParameters().get("kinshipValidation"))) + { + kinshipArgs.add("-v"); + } + + actions.add(runScript("populateKinship.r", GeneticCalculationsImportTask.KINSHIP_FILE, "Kinship Output", kinshipArgs)); return new RecordedActionSet(actions); } - public RecordedAction runScript(String scriptName, String outputFileName, String actionLabel) throws PipelineJobException + public RecordedAction runScript(String scriptName, String outputFileName, String actionLabel, @Nullable List extraArgs) throws PipelineJobException { PipelineJob job = getJob(); FileAnalysisJobSupport support = (FileAnalysisJobSupport) job; @@ -133,6 +146,10 @@ public RecordedAction runScript(String scriptName, String outputFileName, String args.add(scriptPath); args.add("-f"); args.add(tsvFile.getPath()); + if (extraArgs != null) + { + args.addAll(extraArgs); + } getJob().getLogger().info("Using working directory of: " + support.getAnalysisDirectory().getPath()); ProcessBuilder pb = new ProcessBuilder(args); diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java index b3bd6c833..45f471727 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java @@ -86,6 +86,8 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay String xml = "\n" + "\n" + (allowRunningDuringDay ? "\ttrue" : "") + + "\t" + GeneticCalculationsJob.isKinshipValidation() + "" + + "\t" + GeneticCalculationsJob.isMergeSpeciesWithHybrids() + "" + ""; AbstractFileAnalysisProtocol protocol = factory.createProtocolInstance(protocolName, "", xml); From 851839195f759559cfb814318fb8533dcc489f72 Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 5 Sep 2023 17:15:58 -0700 Subject: [PATCH 07/11] Avoid another fringe kinship case --- ehr/resources/pipeline/kinship/populateKinship.r | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index ed1e55037..465fd10fc 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -84,6 +84,10 @@ generateSpeciesToProcess <- function(allPed, mergeSpeciesWithHybrids) { } validateExpectedKinshipSubset <- function(dataToTest, expectedValues, errorRows, testReciprocal = TRUE) { + if (nrow(dataToTest) == 0 || nrow(expectedValues) == 0) { + return(errorRows) + } + # Generate the reciprocal of relationships as well: if (testReciprocal) { ret2 <- data.frame(Id = expectedValues$Id2, Id2 = expectedValues$Id, Relationship = expectedValues$Relationship, ExpectedCoefficient = expectedValues$ExpectedCoefficient) From 335b23119c18a6d7807f21d984a6d3f8bbd41472 Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 12 Sep 2023 11:33:25 -0700 Subject: [PATCH 08/11] Add setting to control whether kinship import is allowed during business hours --- .../panel/GeneticCalculationSettingsPanel.js | 8 +++++++- ehr/src/org/labkey/ehr/EHRController.java | 17 +++++++++++++++-- .../ehr/pipeline/GeneticCalculationsJob.java | 16 ++++++++++++++-- .../pipeline/GeneticCalculationsRunnable.java | 2 +- 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js b/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js index f4b8a3257..3d14716b4 100644 --- a/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js +++ b/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js @@ -24,6 +24,10 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { xtype: 'checkbox', fieldLabel: 'Is Enabled?', itemId: 'enabled' + },{ + xtype: 'checkbox', + fieldLabel: 'Allow Import During Business Hours?', + itemId: 'allowImportDuringBusinessHours' },{ xtype: 'checkbox', fieldLabel: 'Kinship validation?', @@ -107,6 +111,7 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { this.down('#containerPath').setValue(results.containerPath); this.down('#kinshipValidation').setValue(results.kinshipValidation); this.down('#mergeSpeciesWithHybrids').setValue(results.mergeSpeciesWithHybrids); + this.down('#allowImportDuringBusinessHours').setValue(results.allowImportDuringBusinessHours) }, saveData: function(){ @@ -118,7 +123,8 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { enabled: this.down('#enabled').getValue(), hourOfDay: this.down('#hourOfDay').getValue(), kinshipValidation: this.down('#kinshipValidation').getValue(), - mergeSpeciesWithHybrids: this.down('#mergeSpeciesWithHybrids').getValue() + mergeSpeciesWithHybrids: this.down('#mergeSpeciesWithHybrids').getValue(), + allowImportDuringBusinessHours: this.down('#allowImportDuringBusinessHours').getValue() }, method : 'POST', scope: this, diff --git a/ehr/src/org/labkey/ehr/EHRController.java b/ehr/src/org/labkey/ehr/EHRController.java index de873b492..f3be38338 100644 --- a/ehr/src/org/labkey/ehr/EHRController.java +++ b/ehr/src/org/labkey/ehr/EHRController.java @@ -83,6 +83,7 @@ import org.labkey.api.settings.AppProps; import org.labkey.api.study.DatasetTable; import org.labkey.api.util.ExceptionUtil; +import org.labkey.api.util.HtmlString; import org.labkey.api.util.HtmlStringBuilder; import org.labkey.api.util.PageFlowUtil; import org.labkey.api.util.Path; @@ -639,7 +640,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er errors.reject(ERROR_MSG, "Unable to find container for path: " + form.getContainerPath()); return null; } - GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation(), form.isMergeSpeciesWithHybrids()); + GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation(), form.isMergeSpeciesWithHybrids(), form.isAllowImportDuringBusinessHours()); return new ApiSimpleResponse("success", true); } @@ -760,6 +761,7 @@ public static class ScheduleGeneticCalculationForm private boolean _kinshipValidation; private boolean _mergeSpeciesWithHybrids; + private boolean _allowImportDuringBusinessHours; public boolean isEnabled() { @@ -810,6 +812,16 @@ public void setMergeSpeciesWithHybrids(boolean mergeSpeciesWithHybrids) { _mergeSpeciesWithHybrids = mergeSpeciesWithHybrids; } + + public boolean isAllowImportDuringBusinessHours() + { + return _allowImportDuringBusinessHours; + } + + public void setAllowImportDuringBusinessHours(boolean allowImportDuringBusinessHours) + { + _allowImportDuringBusinessHours = allowImportDuringBusinessHours; + } } @RequiresPermission(AdminPermission.class) @@ -829,6 +841,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er ret.put("hourOfDay", GeneticCalculationsJob.getHourOfDay()); ret.put("kinshipValidation", GeneticCalculationsJob.isKinshipValidation()); ret.put("mergeSpeciesWithHybrids", GeneticCalculationsJob.isMergeSpeciesWithHybrids()); + ret.put("allowImportDuringBusinessHours", GeneticCalculationsJob.isAllowImportDuringBusinessHours()); return new ApiSimpleResponse(ret); } @@ -1262,7 +1275,7 @@ public void validateCommand(Object form, Errors errors) @Override public ModelAndView getConfirmView(Object form, BindException errors) { - return new HtmlView("This will cause the system to recalculate kinship and inbreeding coefficients on the colony. Do you want to continue?"); + return new HtmlView(HtmlString.of("This will cause the system to recalculate kinship and inbreeding coefficients on the colony. Do you want to continue?")); } @Override diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java index a5e86ad0a..883ff9188 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java @@ -132,6 +132,17 @@ public static boolean isMergeSpeciesWithHybrids() return false; } + + public static boolean isAllowImportDuringBusinessHours() + { + Map saved = PropertyManager.getProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN); + + if (saved.containsKey("allowImportDuringBusinessHours")) + return Boolean.parseBoolean(saved.get("allowImportDuringBusinessHours")); + else + return false; + } + public static boolean isEnabled() { Map saved = PropertyManager.getProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN); @@ -162,7 +173,7 @@ public static Integer getHourOfDay() return null; } - public static void setProperties(Boolean isEnabled, Container c, Integer hourOfDay, Boolean isKinshipValidation, Boolean mergeSpeciesWithHybrids) + public static void setProperties(Boolean isEnabled, Container c, Integer hourOfDay, Boolean isKinshipValidation, Boolean mergeSpeciesWithHybrids, Boolean allowImportDuringBusinessHours) { PropertyManager.PropertyMap props = PropertyManager.getWritableProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN, true); props.put("enabled", isEnabled.toString()); @@ -170,6 +181,7 @@ public static void setProperties(Boolean isEnabled, Container c, Integer hourOfD props.put("hourOfDay", hourOfDay.toString()); props.put("kinshipValidation", isKinshipValidation.toString()); props.put("mergeSpeciesWithHybrids", mergeSpeciesWithHybrids.toString()); + props.put("allowImportDuringBusinessHours", allowImportDuringBusinessHours.toString()); props.save(); //unschedule in case settings have changed @@ -193,7 +205,7 @@ public void execute(JobExecutionContext context) throws JobExecutionException try { _log.info("Running Scheduled Genetic Calculations Job"); - new GeneticCalculationsRunnable().run(c, false); + new GeneticCalculationsRunnable().run(c, isAllowImportDuringBusinessHours()); } catch (PipelineJobException e) { diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java index 45f471727..4ec2cc288 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java @@ -85,7 +85,7 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay String protocolName = "EHR Kinship Calculation"; String xml = "\n" + "\n" + - (allowRunningDuringDay ? "\ttrue" : "") + + "\t" + allowRunningDuringDay + "" + "\t" + GeneticCalculationsJob.isKinshipValidation() + "" + "\t" + GeneticCalculationsJob.isMergeSpeciesWithHybrids() + "" + ""; From 2cbf5990477a4be0a5d1b138059f5c559a6421b3 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 28 Feb 2024 09:25:57 -0800 Subject: [PATCH 09/11] Back out some changes not directly related to remote kinship execution --- .../pipeline/kinship/populateKinship.r | 168 +----------------- .../panel/GeneticCalculationSettingsPanel.js | 14 -- ehr/src/org/labkey/ehr/EHRController.java | 14 +- .../ehr/pipeline/GeneticCalculationsJob.java | 14 +- .../pipeline/GeneticCalculationsRTask.java | 22 +-- .../pipeline/GeneticCalculationsRunnable.java | 2 - 6 files changed, 9 insertions(+), 225 deletions(-) diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index 465fd10fc..46e0c845d 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -13,20 +13,10 @@ library(Matrix) library(dplyr) spec <- matrix(c( - 'inputFile', '-f', 1, 'character', - 'mergeSpeciesWithHybrids', '-m', 0, 'logical', - 'performKinshipValidation', '-v', 0, 'logical' + 'inputFile', '-f', 1, 'character' ), ncol=4, byrow=TRUE) opts <- getopt(spec, commandArgs(trailingOnly = TRUE)) -if (is.null(opts$mergeSpeciesWithHybrids)){ - opts$mergeSpeciesWithHybrids <- FALSE -} - -if (is.null(opts$performKinshipValidation)){ - opts$performKinshipValidation <- FALSE -} - allPed <- read.table(opts$inputFile, quote="\"") colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species') @@ -43,147 +33,10 @@ if (any(allPed$Species == 'Unknown')) { print(paste0('There are ', sum(allPed$Species == 'Unknown'), ' Ids with species = Unknown')) } -# The purpose of this function is to handle instances where there was cross-breeding. -# While this is probably rare, it can occur. When this happens, simply merge the entire species together and process as one unit. -# This ensures that all relevant ancestors from both species are present -generateSpeciesToProcess <- function(allPed, mergeSpeciesWithHybrids) { - hybridParents <- dplyr::bind_rows( - merge(allPed, allPed, by.x = 'Dam', by.y = 'Id') %>% filter(Species.x != Species.y) %>% select(Id, Dam, Species.x, Species.y) %>% rename(Parent = Dam) %>% mutate(ParentType = 'Dam'), - merge(allPed, allPed, by.x = 'Sire', by.y = 'Id') %>% filter(Species.x != Species.y) %>% select(Id, Sire, Species.x, Species.y) %>% rename(Parent = Sire) %>% mutate(ParentType = 'Sire') - ) - - if (nrow(hybridParents) > 0) { - print(paste0('There were ', nrow(hybridParents), ' records with parents of a different species')) - } - - if (mergeSpeciesWithHybrids && nrow(hybridParents) > 0) { - speciesGroups <- as.list(as.character(unique(allPed$Species))) - toMerge <- unique(hybridParents[c('Species.x', 'Species.y')]) - for (idx in 1:nrow(toMerge)){ - speciesToCollapse <- as.character(unlist(toMerge[idx,,drop = TRUE])) - matchingIdx <- sapply(speciesGroups, function(x){ - return(length(intersect(speciesToCollapse, x)) > 0) - }) - - speciesToCollapse <- sort(unique(c(speciesToCollapse, unlist(speciesGroups[matchingIdx])))) - speciesGroups <- speciesGroups[!matchingIdx] - speciesGroups <- append(speciesGroups, list(speciesToCollapse)) - } - - print('These species will be merged for processing:') - invisible(lapply(speciesGroups, function(x){ - if (length(x) > 1) { - print(x) - } - })) - - return(speciesGroups) - } else { - return(unique(allPed$Species[allPed$Species != 'Unknown'])) - } -} - -validateExpectedKinshipSubset <- function(dataToTest, expectedValues, errorRows, testReciprocal = TRUE) { - if (nrow(dataToTest) == 0 || nrow(expectedValues) == 0) { - return(errorRows) - } - - # Generate the reciprocal of relationships as well: - if (testReciprocal) { - ret2 <- data.frame(Id = expectedValues$Id2, Id2 = expectedValues$Id, Relationship = expectedValues$Relationship, ExpectedCoefficient = expectedValues$ExpectedCoefficient) - ret2$Relationship <- sapply(expectedValues$Relationship, function(x){ - x <- unlist(strsplit(x, split = '/')) - if (length(x) == 1) { - return(x) - } - - return(paste0(x[2], '/', x[1])) - }) - expectedValues <- dplyr::bind_rows(expectedValues, ret2) - rm(ret2) - } - - dat <- merge(dataToTest, expectedValues, by = c('Id', 'Id2'), all.x = T, all.y = T) %>% - filter(!is.na(ExpectedCoefficient)) %>% - filter(is.na(coefficient) | coefficient < ExpectedCoefficient) - - if (nrow(dat) == 0) { - return(errorRows) - } - - if (all(is.null(errorRows))) { - return(dat) - } - - return(dplyr::bind_rows(errorRows, dat)) -} - -validateExpectedKinship <- function(pedDf, dataToTest) { - errorRows <- NULL - - # See reference: https://en.wikipedia.org/wiki/Coefficient_of_relationship#Kinship_coefficient - self <- data.frame(Id = pedDf$Id, Id2 = pedDf$Id, Relationship = 'Self', ExpectedCoefficient = 0.5) - errorRows <- validateExpectedKinshipSubset(dataToTest, self, errorRows, testReciprocal = FALSE) - rm(self) - - parentChild <- dplyr::bind_rows( - data.frame(Id = pedDf$Id, Id2 = pedDf$Dam, Relationship = 'Child/Parent'), - data.frame(Id = pedDf$Id, Id2 = pedDf$Sire, Relationship = 'Child/Parent') - ) %>% filter(!is.na(Id2)) %>% mutate(ExpectedCoefficient = 0.25) - errorRows <- validateExpectedKinshipSubset(dataToTest, parentChild, errorRows) - rm(parentChild) - - grandParentOffspring1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf, by.x = c('Dam'), by.y = c('Id'), all.x = F, all.y = F) - grandParentOffspring1 <- dplyr::bind_rows( - grandParentOffspring1 %>% select(Id, Dam.y) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Maternal Granddam'), - grandParentOffspring1 %>% select(Id, Sire.y) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Maternal Grandsire') - ) %>% mutate(ExpectedCoefficient = 0.125) - errorRows <- validateExpectedKinshipSubset(dataToTest, grandParentOffspring1, errorRows) - rm(grandParentOffspring1) - - grandParentOffspring2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf, by.x = c('Sire'), by.y = c('Id'), all.x = F, all.y = F) - grandParentOffspring2 <- dplyr::bind_rows( - grandParentOffspring2 %>% select(Id, Dam.y) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Paternal Granddam'), - grandParentOffspring2 %>% select(Id, Sire.y) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Paternal Grandsire') - ) %>% mutate(ExpectedCoefficient = 0.125) - errorRows <- validateExpectedKinshipSubset(dataToTest, grandParentOffspring2, errorRows) - rm(grandParentOffspring2) - - fullSibs <- merge(pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], by = c('Sire', 'Dam'), all.x = F, all.y = F) %>% - select(Id.x, Id.y) %>% - rename(Id = Id.x, Id2 = Id.y) %>% - filter(Id != Id2) %>% - mutate(Relationship = 'Full sib', ExpectedCoefficient = 0.25) - errorRows <- validateExpectedKinshipSubset(dataToTest, fullSibs, errorRows) - rm(fullSibs) - - halfSibs1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf[!is.na(pedDf$Dam),], by = c('Dam'), all.x = F, all.y = F) %>% - filter(Sire.x != Sire.y) %>% - select(Id.x, Id.y) %>% - rename(Id = Id.x, Id2 = Id.y) %>% - filter(Id != Id2) %>% - mutate(Relationship = 'Half sib', ExpectedCoefficient = 0.125) - errorRows <- validateExpectedKinshipSubset(dataToTest, halfSibs1, errorRows) - rm(halfSibs1) - - halfSibs2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Sire),], by = c('Sire'), all.x = F, all.y = F) %>% - filter(Dam.x != Dam.y) %>% - select(Id.x, Id.y) %>% - rename(Id = Id.x, Id2 = Id.y) %>% - filter(Id != Id2) %>% - mutate(Relationship = 'Half sib', ExpectedCoefficient = 0.125) - errorRows <- validateExpectedKinshipSubset(dataToTest, halfSibs2, errorRows) - rm(halfSibs2) - - return(errorRows) -} - -speciesToProcess <- generateSpeciesToProcess(allPed, opts$mergeSpeciesWithHybrids) - newRecords <- NULL -for (speciesSet in speciesToProcess){ - allRecordsForSpecies <- allPed[allPed$Species %in% speciesSet,] - print(paste0('Processing species set: ', paste0(speciesSet, collapse = ','), ', with ', nrow(allRecordsForSpecies), ' IDs')) +for (species in unique(allPed$Species)){ + allRecordsForSpecies <- allPed[allPed$Species %in% species,] + print(paste0('Processing species: ', species, ', with ', nrow(allRecordsForSpecies), ' IDs')) if (nrow(allRecordsForSpecies) == 1) { print('single record, skipping') newRecords <- dplyr::bind_rows(newRecords,data.frame(Id = allRecordsForSpecies$Id, Id2 = allRecordsForSpecies$Id, coefficient = 0.5, Species = allRecordsForSpecies$Species)) @@ -219,19 +72,6 @@ for (speciesSet in speciesToProcess){ temp.tri <- merge(temp.tri, allRecordsForSpecies[c('Id', 'Species')], by = 'Id', all.x = TRUE) newRecords <- dplyr::bind_rows(newRecords,temp.tri) - - # NOTE: perform per-species to save memory - if (opts$performKinshipValidation) { - print('Validating coefficients against expected values') - errorRows <- validateExpectedKinship(allRecordsForSpecies, temp.tri) - if (!all(is.null(errorRows))) { - fileName <- paste0('kinshipErrors_', paste0(speciesSet, collapse = '.'), '.txt') - print(paste0('There were unexpected kinship values! See the file ', fileName, ' for more information')) - write.table(newRecords, file = fileName, row.names = FALSE, quote = FALSE, sep = '\t') - } else { - print('All coefficients were within expected ranges from predicted values') - } - } } # write TSV to disk diff --git a/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js b/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js index 3d14716b4..b5e217dd1 100644 --- a/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js +++ b/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js @@ -42,18 +42,6 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { }); } } - },{ - xtype: 'checkbox', - fieldLabel: 'Merge Species With Hybrids?', - itemId: 'mergeSpeciesWithHybrids', - listeners: { - render: function (c) { - Ext4.create('Ext.tip.ToolTip', { - target: c.getEl(), - html: 'If any hybrid animals are detected, these species groups will be merged and processed as one unit. Merging all these species together ensures that the correct ancestors from each side are present' - }); - } - } },{ xtype: 'numberfield', hideTrigger: true, @@ -110,7 +98,6 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { this.down('#hourOfDay').setValue(results.hourOfDay); this.down('#containerPath').setValue(results.containerPath); this.down('#kinshipValidation').setValue(results.kinshipValidation); - this.down('#mergeSpeciesWithHybrids').setValue(results.mergeSpeciesWithHybrids); this.down('#allowImportDuringBusinessHours').setValue(results.allowImportDuringBusinessHours) }, @@ -123,7 +110,6 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { enabled: this.down('#enabled').getValue(), hourOfDay: this.down('#hourOfDay').getValue(), kinshipValidation: this.down('#kinshipValidation').getValue(), - mergeSpeciesWithHybrids: this.down('#mergeSpeciesWithHybrids').getValue(), allowImportDuringBusinessHours: this.down('#allowImportDuringBusinessHours').getValue() }, method : 'POST', diff --git a/ehr/src/org/labkey/ehr/EHRController.java b/ehr/src/org/labkey/ehr/EHRController.java index f3be38338..2de880002 100644 --- a/ehr/src/org/labkey/ehr/EHRController.java +++ b/ehr/src/org/labkey/ehr/EHRController.java @@ -640,7 +640,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er errors.reject(ERROR_MSG, "Unable to find container for path: " + form.getContainerPath()); return null; } - GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation(), form.isMergeSpeciesWithHybrids(), form.isAllowImportDuringBusinessHours()); + GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation(), form.isAllowImportDuringBusinessHours()); return new ApiSimpleResponse("success", true); } @@ -760,7 +760,6 @@ public static class ScheduleGeneticCalculationForm private int hourOfDay; private boolean _kinshipValidation; - private boolean _mergeSpeciesWithHybrids; private boolean _allowImportDuringBusinessHours; public boolean isEnabled() @@ -803,16 +802,6 @@ public void setKinshipValidation(boolean kinshipValidation) _kinshipValidation = kinshipValidation; } - public boolean isMergeSpeciesWithHybrids() - { - return _mergeSpeciesWithHybrids; - } - - public void setMergeSpeciesWithHybrids(boolean mergeSpeciesWithHybrids) - { - _mergeSpeciesWithHybrids = mergeSpeciesWithHybrids; - } - public boolean isAllowImportDuringBusinessHours() { return _allowImportDuringBusinessHours; @@ -840,7 +829,6 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er ret.put("enabled", GeneticCalculationsJob.isEnabled()); ret.put("hourOfDay", GeneticCalculationsJob.getHourOfDay()); ret.put("kinshipValidation", GeneticCalculationsJob.isKinshipValidation()); - ret.put("mergeSpeciesWithHybrids", GeneticCalculationsJob.isMergeSpeciesWithHybrids()); ret.put("allowImportDuringBusinessHours", GeneticCalculationsJob.isAllowImportDuringBusinessHours()); return new ApiSimpleResponse(ret); diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java index 883ff9188..3b486dee0 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java @@ -122,17 +122,6 @@ public static boolean isKinshipValidation() return false; } - public static boolean isMergeSpeciesWithHybrids() - { - Map saved = PropertyManager.getProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN); - - if (saved.containsKey("mergeSpeciesWithHybrids")) - return Boolean.parseBoolean(saved.get("mergeSpeciesWithHybrids")); - else - return false; - } - - public static boolean isAllowImportDuringBusinessHours() { Map saved = PropertyManager.getProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN); @@ -173,14 +162,13 @@ public static Integer getHourOfDay() return null; } - public static void setProperties(Boolean isEnabled, Container c, Integer hourOfDay, Boolean isKinshipValidation, Boolean mergeSpeciesWithHybrids, Boolean allowImportDuringBusinessHours) + public static void setProperties(Boolean isEnabled, Container c, Integer hourOfDay, Boolean isKinshipValidation, Boolean allowImportDuringBusinessHours) { PropertyManager.PropertyMap props = PropertyManager.getWritableProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN, true); props.put("enabled", isEnabled.toString()); props.put("container", c.getId()); props.put("hourOfDay", hourOfDay.toString()); props.put("kinshipValidation", isKinshipValidation.toString()); - props.put("mergeSpeciesWithHybrids", mergeSpeciesWithHybrids.toString()); props.put("allowImportDuringBusinessHours", allowImportDuringBusinessHours.toString()); props.save(); diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java index 47f7d1a4d..59a0e8b86 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java @@ -104,25 +104,13 @@ public RecordedActionSet run() throws PipelineJobException { List actions = new ArrayList<>(); - actions.add(runScript("populateInbreeding.r", GeneticCalculationsImportTask.INBREEDING_FILE, "Inbreeding Coefficient Output", null)); - - List kinshipArgs = new ArrayList<>(); - if (getJob().getParameters().containsKey("mergeSpeciesWithHybrids") && "true".equalsIgnoreCase(getJob().getParameters().get("mergeSpeciesWithHybrids"))) - { - kinshipArgs.add("-m"); - } - - if (getJob().getParameters().containsKey("kinshipValidation") && "true".equalsIgnoreCase(getJob().getParameters().get("kinshipValidation"))) - { - kinshipArgs.add("-v"); - } - - actions.add(runScript("populateKinship.r", GeneticCalculationsImportTask.KINSHIP_FILE, "Kinship Output", kinshipArgs)); + actions.add(runScript("populateInbreeding.r", GeneticCalculationsImportTask.INBREEDING_FILE, "Inbreeding Coefficient Output")); + actions.add(runScript("populateKinship.r", GeneticCalculationsImportTask.KINSHIP_FILE, "Kinship Output")); return new RecordedActionSet(actions); } - public RecordedAction runScript(String scriptName, String outputFileName, String actionLabel, @Nullable List extraArgs) throws PipelineJobException + public RecordedAction runScript(String scriptName, String outputFileName, String actionLabel) throws PipelineJobException { PipelineJob job = getJob(); FileAnalysisJobSupport support = (FileAnalysisJobSupport) job; @@ -146,10 +134,6 @@ public RecordedAction runScript(String scriptName, String outputFileName, String args.add(scriptPath); args.add("-f"); args.add(tsvFile.getPath()); - if (extraArgs != null) - { - args.addAll(extraArgs); - } getJob().getLogger().info("Using working directory of: " + support.getAnalysisDirectory().getPath()); ProcessBuilder pb = new ProcessBuilder(args); diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java index 4ec2cc288..738310145 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java @@ -86,8 +86,6 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay String xml = "\n" + "\n" + "\t" + allowRunningDuringDay + "" + - "\t" + GeneticCalculationsJob.isKinshipValidation() + "" + - "\t" + GeneticCalculationsJob.isMergeSpeciesWithHybrids() + "" + ""; AbstractFileAnalysisProtocol protocol = factory.createProtocolInstance(protocolName, "", xml); From 16b8c2fd04883bbdd387d085985c66848b10ba51 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 28 Feb 2024 09:36:33 -0800 Subject: [PATCH 10/11] Minor script cleanup --- ehr/resources/pipeline/kinship/populateKinship.r | 2 -- 1 file changed, 2 deletions(-) diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index 46e0c845d..f9b18bf34 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -39,14 +39,12 @@ for (species in unique(allPed$Species)){ print(paste0('Processing species: ', species, ', with ', nrow(allRecordsForSpecies), ' IDs')) if (nrow(allRecordsForSpecies) == 1) { print('single record, skipping') - newRecords <- dplyr::bind_rows(newRecords,data.frame(Id = allRecordsForSpecies$Id, Id2 = allRecordsForSpecies$Id, coefficient = 0.5, Species = allRecordsForSpecies$Species)) next } pctMissingSex <- sum(allRecordsForSpecies$Gender > 2) / nrow(allRecordsForSpecies) if (pctMissingSex > 0.25) { paste0('More than 25% of this species group are missing sex and cannot be processed by fixParents(), skipping') - newRecords <- dplyr::bind_rows(newRecords,data.frame(Id = allRecordsForSpecies$Id, Id2 = allRecordsForSpecies$Id, coefficient = 0.5, Species = allRecordsForSpecies$Species)) next } From 7f2c9875b0f7bf19a9a1df34dec96dc00af26079 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 11 Mar 2024 06:41:52 -0700 Subject: [PATCH 11/11] Code review --- ehr/resources/pipeline/kinship/populateKinship.r | 6 ------ .../labkey/ehr/pipeline/GeneticCalculationsImportTask.java | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index f9b18bf34..fc2cad6be 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -42,12 +42,6 @@ for (species in unique(allPed$Species)){ next } - pctMissingSex <- sum(allRecordsForSpecies$Gender > 2) / nrow(allRecordsForSpecies) - if (pctMissingSex > 0.25) { - paste0('More than 25% of this species group are missing sex and cannot be processed by fixParents(), skipping') - next - } - # Add missing parents for accurate kinship calculations fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender)) diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java index 30b5ebbc9..217ee28b3 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java @@ -153,7 +153,7 @@ public RecordedActionSet run() throws PipelineJobException processInbreeding(job.getContainer(), job.getUser(), support.getAnalysisDirectoryPath().toFile(), job.getLogger()); processKinship(job.getContainer(), job.getUser(), support.getAnalysisDirectoryPath().toFile(), job.getLogger()); - if (Boolean.parseBoolean(getJob().getParameters().get("kinshipValidation"))) + if (GeneticCalculationsJob.isKinshipValidation()) { validateKinship(); }