From 5b0eee7bb8be5ecf3cc9d0248f1bb0098935accc Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 4 Sep 2023 08:54:51 -0700 Subject: [PATCH 1/9] Refactor genetics pipeline to allow standalone import of TSVs that were calculated separately --- .../org/labkey/api/ehr/EHRService.java | 13 ++ .../pipeline/kinship/populateInbreeding.r | 53 ++++--- .../pipeline/kinship/populateKinship.r | 109 +++++++++++---- ehr/src/org/labkey/ehr/EHRServiceImpl.java | 9 ++ .../GeneticCalculationsImportTask.java | 130 ++++++++---------- .../pipeline/GeneticCalculationsRTask.java | 2 +- 6 files changed, 189 insertions(+), 127 deletions(-) diff --git a/ehr/api-src/org/labkey/api/ehr/EHRService.java b/ehr/api-src/org/labkey/api/ehr/EHRService.java index a0885febf..cc4587072 100644 --- a/ehr/api-src/org/labkey/api/ehr/EHRService.java +++ b/ehr/api-src/org/labkey/api/ehr/EHRService.java @@ -15,6 +15,7 @@ */ package org.labkey.api.ehr; +import org.apache.logging.log4j.Logger; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import org.labkey.api.data.AbstractTableInfo; @@ -31,6 +32,7 @@ import org.labkey.api.ehr.history.*; import org.labkey.api.ldk.table.ButtonConfigFactory; import org.labkey.api.module.Module; +import org.labkey.api.pipeline.PipelineJobException; import org.labkey.api.query.BatchValidationException; import org.labkey.api.query.DetailsURL; import org.labkey.api.query.ExprColumn; @@ -43,6 +45,7 @@ import org.labkey.api.view.ActionURL; import org.labkey.api.view.template.ClientDependency; +import java.io.File; import java.io.IOException; import java.util.Collection; import java.util.Date; @@ -324,4 +327,14 @@ public EHRQCState getQCState(@NotNull Container c) /** The EHR expects certain QC states to exist. This will inspect the current study and create any missing QC states. **/ abstract public List ensureStudyQCStates(Container c, final User u, final boolean commitChanges); + + /** + * The EHR has a built-in GeneticsCalculations pipeline job that computes inbreeding and kinship based on the pedigree. + * These are normally calculated in R, saved as TSVs, and imported using java code. This method is a separate entrypoint + * that allows other code perform the calculations, save the results as TSVs, and then trigger import here. + * + * A use case is a separate pipeline server that performs the R computation on a cluster, and then triggers the main webserver to import + * those results. + */ + abstract public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException; } diff --git a/ehr/resources/pipeline/kinship/populateInbreeding.r b/ehr/resources/pipeline/kinship/populateInbreeding.r index 9bf71ba85..49145eec5 100644 --- a/ehr/resources/pipeline/kinship/populateInbreeding.r +++ b/ehr/resources/pipeline/kinship/populateInbreeding.r @@ -7,47 +7,46 @@ # This R script will calculate and store inbreeding coefficients for all animals in the colony. This data will be compared against # the information currently stored in the DB and the minimal number of inserts/updates/deletes are then performed. This script is designed # to run as a daily cron job. - - -options(error = dump.frames); -library(pedigree); -library(getopt); -library(Matrix); +options(error = dump.frames) +library(pedigree) +library(getopt) +library(Matrix) library(dplyr) spec <- matrix(c( -'inputFile', '-f', 1, "character" -), ncol=4, byrow=TRUE); -opts = getopt(spec, commandArgs(trailingOnly = TRUE)); + 'inputFile', '-f', 1, 'character' +), ncol=4, byrow=TRUE) +opts <- getopt(spec, commandArgs(trailingOnly = TRUE)) -allPed <- read.table(opts$inputFile); +allPed <- read.table(opts$inputFile) colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender') -is.na(allPed$Id)<-which(allPed$Id=="") -is.na(allPed$Dam)<-which(allPed$Dam=="") -is.na(allPed$Sire)<-which(allPed$Sire=="") -is.na(allPed$Gender)<-which(allPed$Gender=="") - -df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE); -colnames(df)<-c("id", "id parent1", "id parent2") +allPed$Id[allPed$Id == ""] <- NA +allPed$Id[allPed$Dam == ""] <- NA +allPed$Id[allPed$Sire == ""] <- NA +allPed$Id[allPed$Gender == ""] <- NA -originalIds <-as.data.frame(df[,1,drop=FALSE]) +df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE) +originalIds <- df$id #this is a function in the pedigree package designed to add missing parents to the dataframe #see pedigree package documentation for more detail -df <- add.Inds(df); +df <- add.Inds(df) ord <- orderPed(df) df <- df[order(ord),] #use an existing package to calculate inbreeding -ib = calcInbreeding(df); - -newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE); +ib <- calcInbreeding(df) #only calculate inbreeding for Ids at the center -newRecords <- dplyr::filter(newRecords, Id %in% originalIds$id) +newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE) %>% + dplyr::filter(Id %in% originalIds) + +print("Output table:") +print(str(newRecords)) + +if (nrow(newRecords) != length(originalIds)) { + stop(paste0('Output dataframe and input IDs not the same length! Expected: ', length(originalIds), ', was: ', nrow(newRecords))) +} -# write TSV to disk -print("Output table:"); -print(str(newRecords)); -write.table(newRecords, file = "inbreeding.txt", append = FALSE,row.names=F,quote=F,sep="\t"); \ No newline at end of file +write.table(newRecords, file = "inbreeding.txt", append = FALSE, row.names=F, quote=F, sep="\t") \ No newline at end of file diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index abffbd120..e930530a1 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -7,30 +7,24 @@ # This R script will calculate and store kinship coefficients (aka. relatedness) for all animals in the colony. This is a large, sparse matrix. # The matrix is converted into a very long 3-column dataframe (animal1, animal2, coefficient). This dataframe is output to a TSV file, # which is normally imported into ehr.kinship by java code in GeneticCalculationsImportTask - - -#options(echo=TRUE); -options(error = dump.frames); -library(methods); -library(kinship2); -library(getopt); -library(Matrix); -library(dplyr); +options(error = dump.frames) +library(kinship2) +library(getopt) +library(Matrix) +library(dplyr) spec <- matrix(c( -#'containerPath', '-c', 1, "character", -#'baseUrl', '-b', 1, "character" -'inputFile', '-f', 1, "character" -), ncol=4, byrow=TRUE); -opts = getopt(spec, commandArgs(trailingOnly = TRUE)); + 'inputFile', '-f', 1, "character" +), ncol=4, byrow=TRUE) +opts <- getopt(spec, commandArgs(trailingOnly = TRUE)) -allPed <- read.table(opts$inputFile, quote="\""); -colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species'); +allPed <- read.table(opts$inputFile, quote="\"") +colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species') -is.na(allPed$Id)<-which(allPed$Id=="") -is.na(allPed$Dam)<-which(allPed$Dam=="") -is.na(allPed$Sire)<-which(allPed$Sire=="") -is.na(allPed$Gender)<-which(allPed$Gender=="") +allPed$Id[allPed$Id == ""] <- NA +allPed$Id[allPed$Dam == ""] <- NA +allPed$Id[allPed$Sire == ""] <- NA +allPed$Id[allPed$Gender == ""] <- NA allPed$Species <- as.character(allPed$Species) allPed$Species[is.na(allPed$Species)] <- c('Unknown') @@ -38,10 +32,10 @@ allPed$Species <- as.factor(allPed$Species) # In order to reduce the max matrix size, calculate famids using makefamid, then analyze each group separately # It resizes the biggest matrix from 12000^2 to 8200^2 thus reduces the memory used by half -newRecords=NULL +newRecords <- NULL for (species in unique(allPed$Species)){ - print(paste0('processing species: ', species)) allRecordsForSpecies <- allPed[allPed$Species == species,] + print(paste0('Processing species: ', species, ', with ', nrow(allRecordsForSpecies), ' IDs')) # Add missing parents for accurate kinship calculations fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender)) @@ -49,25 +43,80 @@ for (species in unique(allPed$Species)){ # Kinship is expecting records to be sorted IAW it's own pedigree function recordsForSpecies <- with(fixedRecords, pedigree(id=id,dadid=dadid,momid=momid,sex=sex,missid=0)) - temp.kin=kinship(recordsForSpecies) + temp.kin <- kinship(recordsForSpecies) # Add rownames to make matrix symmetric, which is required downstream rownames(temp.kin) <- colnames(temp.kin) # Convert kinship matrix to a triplet list of two ids and their coefficient - summaryDf = as.data.frame(summary(as(temp.kin, "dgCMatrix"))) + summaryDf <- as.data.frame(summary(as(temp.kin, "dgCMatrix"))) idList <- rownames(temp.kin) - temp.tri = data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x) + temp.tri <- data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x) # Now filter out parents added for kinship calculation temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE)) temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE)) - newRecords=rbind(newRecords,temp.tri) - print(paste0('total subjects: ', nrow(allRecordsForSpecies))) + newRecords <- rbind(newRecords,temp.tri) + print(paste0('Total subjects: ', nrow(allRecordsForSpecies))) +} + +generateExpectedKinship <- function(pedDf) { + parentChild <- rbind( + data.frame(Id = pedDf$Id, Id2 = pedDf$Dam, Species = pedDf$Species, Relationship = 'Child/Parent'), + data.frame(Id = pedDf$Id, Id2 = pedDf$Sire, Species = pedDf$Species, Relationship = 'Child/Parent') + ) %>% filter(!is.na(Id2)) %>% mutate(ExpectedCoefficient = 0.5) + + grandParentOffspring1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf, by.x = c('Dam', 'Species'), by.y = c('Id', 'Species'), all.x = F, all.y = F) + grandParentOffspring1 <- rbind( + grandParentOffspring1 %>% select(Id, Dam.y, Species) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Maternal Granddam'), + grandParentOffspring1 %>% select(Id, Sire.y, Species) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Maternal Grandsire') + ) %>% mutate(ExpectedCoefficient = 0.25) + + grandParentOffspring2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf, by.x = c('Sire', 'Species'), by.y = c('Id', 'Species'), all.x = F, all.y = F) + grandParentOffspring2 <- rbind( + grandParentOffspring2 %>% select(Id, Dam.y, Species) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Paternal Granddam'), + grandParentOffspring2 %>% select(Id, Sire.y, Species) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Paternal Grandsire') + ) %>% mutate(ExpectedCoefficient = 0.25) + + fullSibs <- merge(pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], by = c('Sire', 'Dam', 'Species'), all.x = F, all.y = F) %>% + select(Id.x, Id.y, Species) %>% + rename(Id = Id.x, Id2 = Id.y) %>% + mutate(Relationship = 'Full sib', ExpectedCoefficient = 0.25) + + ret <- rbind( + parentChild, + grandParentOffspring1, + grandParentOffspring2, + fullSibs + ) + + # Generate the reciprocal of relationships as well: + ret2 <- data.frame(Id = ret$Id2, Id2 = ret$Id, Species = ret$Species, Relationship = ret$Relationship, ExpectedCoefficient = ret$ExpectedCoefficient) + ret2$Relationship <- sapply(ret2$Relationship, function(x){ + x <- unlist(strsplit(x, split = '/')) + if (length(x) == 1) { + return(x) + } + + return(paste0(x[2], '/', x[1])) + }) + + return(rbind(ret, ret2)) +} + +# Basic validation: +toValidate <- merge(newRecords, generateExpectedKinship(pedDf), by = c('Id', 'Id2', 'Species'), all.x = T, all.y = T) %>% + filter(!is.na(ExpectedCoefficient)) %>% + mutate(MinCoefficient = ExpectedCoefficient / 2) %>% + filter(is.na(coefficient) | coefficient < MinCoefficient) + +if (nrow(toValidate) > 0) { + print(paste0('There were unexpected kinship values! See the file kinshipErrors.txt for more information')) + write.table(newRecords, file = "kinshipErrors.txt", append = FALSE, row.names=F, quote=F, sep='\t') } # write TSV to disk -print("Output table:"); -print(str(newRecords)); -write.table(newRecords, file = "kinship.txt", append = FALSE,row.names=F,quote=F,sep="\t"); \ No newline at end of file +print("Output table:") +print(str(newRecords)) +write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep='\t') \ No newline at end of file diff --git a/ehr/src/org/labkey/ehr/EHRServiceImpl.java b/ehr/src/org/labkey/ehr/EHRServiceImpl.java index 9047aefc4..bdb2554cd 100644 --- a/ehr/src/org/labkey/ehr/EHRServiceImpl.java +++ b/ehr/src/org/labkey/ehr/EHRServiceImpl.java @@ -50,6 +50,7 @@ import org.labkey.api.module.ModuleLoader; import org.labkey.api.module.ModuleProperty; import org.labkey.api.pipeline.PipeRoot; +import org.labkey.api.pipeline.PipelineJobException; import org.labkey.api.pipeline.PipelineService; import org.labkey.api.query.BatchValidationException; import org.labkey.api.query.DetailsURL; @@ -78,10 +79,12 @@ import org.labkey.ehr.history.DefaultObservationsDataSource; import org.labkey.ehr.history.DefaultPregnanciesDataSource; import org.labkey.ehr.history.LabworkManager; +import org.labkey.ehr.pipeline.GeneticCalculationsImportTask; import org.labkey.ehr.security.EHRSecurityManager; import org.labkey.ehr.table.DefaultEHRCustomizer; import org.labkey.ehr.table.SNOMEDCodesDisplayColumn; +import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; @@ -1050,4 +1053,10 @@ public void appendSNOMEDCols(AbstractTableInfo ti, String displayColumnName, Str ti.addColumn(simpleRawCol); } } + + @Override + public void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException + { + GeneticCalculationsImportTask.standaloneProcessKinshipAndInbreeding(c, u, pipelineDir, log); + } } diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java index 28d277270..5bd5adbe5 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java @@ -16,6 +16,7 @@ package org.labkey.ehr.pipeline; import org.apache.commons.lang3.StringUtils; +import org.apache.logging.log4j.Logger; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import org.labkey.api.collections.CaseInsensitiveHashMap; @@ -50,6 +51,7 @@ import org.labkey.api.query.QueryUpdateServiceException; import org.labkey.api.query.UserSchema; import org.labkey.api.query.ValidationException; +import org.labkey.api.reader.Readers; import org.labkey.api.security.User; import org.labkey.api.util.FileType; import org.labkey.api.util.PageFlowUtil; @@ -116,7 +118,7 @@ public List getProtocolActionNames() } @Override - public PipelineJob.Task createTask(PipelineJob job) + public PipelineJob.Task createTask(PipelineJob job) { GeneticCalculationsImportTask task = new GeneticCalculationsImportTask(this, job); setJoin(false); @@ -145,8 +147,11 @@ public RecordedActionSet run() throws PipelineJobException } else { - processInbreeding(); - processKinship(); + PipelineJob job = getJob(); + FileAnalysisJobSupport support = (FileAnalysisJobSupport) job; + + processInbreeding(job.getContainer(), job.getUser(), support.getAnalysisDirectoryPath().toFile(), job.getLogger()); + processKinship(job.getContainer(), job.getUser(), support.getAnalysisDirectoryPath().toFile(), job.getLogger()); if (GeneticCalculationsJob.isKinshipValidation()) validateKinship(); @@ -155,24 +160,27 @@ public RecordedActionSet run() throws PipelineJobException return new RecordedActionSet(actions); } - private void processKinship() throws PipelineJobException + public static void standaloneProcessKinshipAndInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException { - PipelineJob job = getJob(); - FileAnalysisJobSupport support = (FileAnalysisJobSupport) job; + processInbreeding(c, u, pipelineDir, log); + processKinship(c, u, pipelineDir, log); + } - File output = new File(support.getAnalysisDirectory(), KINSHIP_FILE); + private static void processKinship(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException + { + File output = new File(pipelineDir, KINSHIP_FILE); if (!output.exists()) throw new PipelineJobException("Unable to find file: " + output.getPath()); DbSchema ehrSchema = EHRSchema.getInstance().getSchema(); TableInfo kinshipTable = ehrSchema.getTable("kinship"); - getJob().getLogger().info("Inspecting file length: " + output.getPath()); + log.info("Inspecting file length: " + output.getPath()); try { try (DbScope.Transaction transaction = ExperimentService.get().ensureTransaction(); - LineNumberReader lnr = new LineNumberReader(new BufferedReader(new FileReader(output)))) + LineNumberReader lnr = new LineNumberReader(Readers.getReader(output))) { while (lnr.readLine() != null) { @@ -180,19 +188,19 @@ private void processKinship() throws PipelineJobException break; } int lineNumber = lnr.getLineNumber(); - lnr.close(); - if (lineNumber < 3) + { throw new PipelineJobException("Too few lines found in output. Line count was: " + lineNumber); + } //delete all previous records - getJob().getLogger().info("Deleting existing rows"); - Table.delete(kinshipTable, new SimpleFilter(FieldKey.fromString("container"), getJob().getContainerId(), CompareType.EQUAL)); + log.info("Deleting existing rows"); + Table.delete(kinshipTable, new SimpleFilter(FieldKey.fromString("container"), c.getId(), CompareType.EQUAL)); //NOTE: this process creates and deletes a ton of rows each day. the rowId can balloon very quickly, so we reset it here SqlSelector ss = new SqlSelector(kinshipTable.getSchema(), new SQLFragment("SELECT max(rowid) as expt FROM " + kinshipTable.getSelectName())); List ret = ss.getArrayList(Long.class); - Integer maxVal; + int maxVal; if (ret.isEmpty()) { maxVal = 0; @@ -230,9 +238,9 @@ else if (kinshipTable.getSqlDialect().isPostgreSQL()) } try (DbScope.Transaction transaction = ExperimentService.get().ensureTransaction(); - BufferedReader reader = new BufferedReader(new FileReader(output))) + BufferedReader reader = Readers.getReader(output)) { - getJob().getLogger().info("Inserting rows"); + log.info("Inserting rows"); String line = null; int lineNum = 0; while ((line = reader.readLine()) != null) @@ -246,7 +254,7 @@ else if (kinshipTable.getSqlDialect().isPostgreSQL()) if (fields[0].equalsIgnoreCase(fields[1])) continue; //dont import self-kinship - Map row = new HashMap(); + Map row = new HashMap<>(); assert fields[0].length() < 80 : "Field Id value too long: [" + fields[0] + ']'; assert fields[1].length() < 80 : "Field Id2 value too long: [" + fields[1] + "]"; @@ -261,19 +269,19 @@ else if (kinshipTable.getSqlDialect().isPostgreSQL()) throw new PipelineJobException("Invalid kinship coefficient on line " + (lineNum + 1) + " for IDs " + fields[0] + " and " + fields[1] + ": " + fields[2], e); } - row.put("container", job.getContainer().getId()); + row.put("container", c.getId()); row.put("created", new Date()); - row.put("createdby", job.getUser().getUserId()); - Table.insert(job.getUser(), kinshipTable, row); + row.put("createdby", u.getUserId()); + Table.insert(u, kinshipTable, row); lineNum++; if (lineNum % 100000 == 0) { - getJob().getLogger().info("processed " + lineNum + " rows"); + log.info("processed " + lineNum + " rows"); } } - job.getLogger().info("Inserted " + lineNum + " rows into ehr.kinship"); + log.info("Inserted " + lineNum + " rows into ehr.kinship"); transaction.commit(); } } @@ -431,7 +439,7 @@ private void validateSetOfRelations(TableInfo kinshipTable, Map familyMembers) { - PipelineJob job = getJob(); - TableSelector familyTs = new TableSelector(familyTable, new LinkedHashSet(familyMembers)); + TableSelector familyTs = new TableSelector(familyTable, new LinkedHashSet<>(familyMembers)); Map> relations = new HashMap<>(); familyTs.forEach(rs -> { @@ -564,7 +570,7 @@ private boolean validateKinship() return true; } - private TableInfo getRealTable(TableInfo ti) + private static TableInfo getRealTable(TableInfo ti) { Domain domain = ti.getDomain(); if (domain != null) @@ -575,48 +581,44 @@ private TableInfo getRealTable(TableInfo ti) return null; } - private void processInbreeding() throws PipelineJobException + private static void processInbreeding(Container c, User u, File pipelineDir, Logger log) throws PipelineJobException { - PipelineJob job = getJob(); - FileAnalysisJobSupport support = (FileAnalysisJobSupport) job; - - File output = new File(support.getAnalysisDirectory(), INBREEDING_FILE); + File output = new File(pipelineDir, INBREEDING_FILE); if (!output.exists()) throw new PipelineJobException("Unable to find file: " + output.getPath()); - UserSchema us = QueryService.get().getUserSchema(job.getUser(), job.getContainer(), "study"); + UserSchema us = QueryService.get().getUserSchema(u, c, "study"); TableInfo ti = us.getTable("Inbreeding Coefficients"); if (ti == null) { - getJob().getLogger().warn("Unable to find table study.inbreeding coefficients"); + log.warn("Unable to find table study.inbreeding coefficients"); return; } QueryUpdateService qus = ti.getUpdateService(); qus.setBulkLoad(true); - LineNumberReader lnr = null; - BufferedReader reader = null; - - try + try (BufferedReader reader = Readers.getReader(output)) { try (DbScope.Transaction transaction = ExperimentService.get().ensureTransaction()) { - getJob().getLogger().info("Inspecting file length: " + output.getPath()); - lnr = new LineNumberReader(new BufferedReader(new FileReader(output))); - while (lnr.readLine() != null) + log.info("Inspecting file length: " + output.getPath()); + try (LineNumberReader lnr = new LineNumberReader(Readers.getReader(output))) { - if (lnr.getLineNumber() > 3) - break; + while (lnr.readLine() != null) + { + if (lnr.getLineNumber() > 3) + break; + } + int lineNumber = lnr.getLineNumber(); + if (lineNumber < 3) + { + throw new PipelineJobException("Too few lines found in inbreeding output. Line count was: " + lineNumber); + } } - int lineNumber = lnr.getLineNumber(); - lnr.close(); - - if (lineNumber < 3) - throw new PipelineJobException("Too few lines found in inbreeding output. Line count was: " + lineNumber); //delete all previous records - getJob().getLogger().info("Deleting existing rows"); + log.info("Deleting existing rows"); TableInfo realTable = getRealTable(ti); if (realTable == null) { @@ -628,14 +630,12 @@ private void processInbreeding() throws PipelineJobException transaction.commit(); } - reader = new BufferedReader(new FileReader(output)); - String line; int lineNum = 0; List> rows = new ArrayList<>(); Date date = new Date(); - getJob().getLogger().info("Reading file"); + log.info("Reading file"); while ((line = reader.readLine()) != null){ String[] fields = line.split("\t"); if (fields.length < 2) @@ -643,11 +643,11 @@ private void processInbreeding() throws PipelineJobException if ("coefficient".equalsIgnoreCase(fields[1])) continue; //skip header - Map row = new CaseInsensitiveHashMap(); + Map row = new CaseInsensitiveHashMap<>(); String subjectId = StringUtils.trimToNull(fields[0]); if (subjectId == null) { - getJob().getLogger().error("Missing subjectId on row " + lineNum); + log.error("Missing subjectId on row " + lineNum); continue; } @@ -660,22 +660,22 @@ private void processInbreeding() throws PipelineJobException lineNum++; } - getJob().getLogger().info("Inserting rows"); + log.info("Inserting rows"); BatchValidationException errors = new BatchValidationException(); Map options = new HashMap<>(); - options.put(QueryUpdateService.ConfigParameters.Logger, getJob().getLogger()); + options.put(QueryUpdateService.ConfigParameters.Logger, log); try (DbScope.Transaction transaction = ExperimentService.get().ensureTransaction()) { - qus.insertRows(getJob().getUser(), getJob().getContainer(), rows, errors, options, new HashMap()); + qus.insertRows(u, c, rows, errors, options, new HashMap<>()); if (errors.hasErrors()) throw errors; transaction.commit(); } - job.getLogger().info("Inserted " + lineNum + " rows into inbreeding coefficients table"); + log.info("Inserted " + lineNum + " rows into inbreeding coefficients table"); } catch (DuplicateKeyException | SQLException | IOException | QueryUpdateServiceException e) @@ -684,21 +684,13 @@ private void processInbreeding() throws PipelineJobException } catch (BatchValidationException e) { - getJob().getLogger().info("error inserting rows"); + log.info("error inserting rows"); for (ValidationException ve : e.getRowErrors()) { - getJob().getLogger().info(ve.getMessage()); + log.info(ve.getMessage()); } throw new PipelineJobException(e); } - finally - { - if (lnr != null) - try{lnr.close();}catch (Exception ignored){} - - if (reader != null) - try{reader.close();}catch (Exception ignored){} - } } } diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java index 7a075a391..1fe9d72aa 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java @@ -82,7 +82,7 @@ public List getProtocolActionNames() } @Override - public PipelineJob.Task createTask(PipelineJob job) + public PipelineJob.Task createTask(PipelineJob job) { GeneticCalculationsRTask task = new GeneticCalculationsRTask(this, job); setJoin(false); From 117fd05962560c04cf40630d375efd0aad6ff371 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 4 Sep 2023 12:04:12 -0700 Subject: [PATCH 2/9] Improve R scripts: - Make R scripts exit immediately on error - Bugfix to expected kinship/validation --- .../pipeline/kinship/populateInbreeding.r | 5 +-- .../pipeline/kinship/populateKinship.r | 32 +++++++++++-------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/ehr/resources/pipeline/kinship/populateInbreeding.r b/ehr/resources/pipeline/kinship/populateInbreeding.r index 49145eec5..629178452 100644 --- a/ehr/resources/pipeline/kinship/populateInbreeding.r +++ b/ehr/resources/pipeline/kinship/populateInbreeding.r @@ -7,7 +7,6 @@ # This R script will calculate and store inbreeding coefficients for all animals in the colony. This data will be compared against # the information currently stored in the DB and the minimal number of inserts/updates/deletes are then performed. This script is designed # to run as a daily cron job. -options(error = dump.frames) library(pedigree) library(getopt) library(Matrix) @@ -28,6 +27,7 @@ allPed$Id[allPed$Gender == ""] <- NA df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE) originalIds <- df$id +print(paste0('Input IDs: ', nrow(df))) #this is a function in the pedigree package designed to add missing parents to the dataframe #see pedigree package documentation for more detail @@ -42,9 +42,6 @@ ib <- calcInbreeding(df) newRecords <- data.frame(Id=as.character(df$id), coefficient=ib, stringsAsFactors=FALSE) %>% dplyr::filter(Id %in% originalIds) -print("Output table:") -print(str(newRecords)) - if (nrow(newRecords) != length(originalIds)) { stop(paste0('Output dataframe and input IDs not the same length! Expected: ', length(originalIds), ', was: ', nrow(newRecords))) } diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index e930530a1..762329d7a 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -7,7 +7,6 @@ # This R script will calculate and store kinship coefficients (aka. relatedness) for all animals in the colony. This is a large, sparse matrix. # The matrix is converted into a very long 3-column dataframe (animal1, animal2, coefficient). This dataframe is output to a TSV file, # which is normally imported into ehr.kinship by java code in GeneticCalculationsImportTask -options(error = dump.frames) library(kinship2) library(getopt) library(Matrix) @@ -56,32 +55,36 @@ for (species in unique(allPed$Species)){ # Now filter out parents added for kinship calculation temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE)) temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE)) + temp.tri$Species <- species newRecords <- rbind(newRecords,temp.tri) - print(paste0('Total subjects: ', nrow(allRecordsForSpecies))) } generateExpectedKinship <- function(pedDf) { + # See reference: https://en.wikipedia.org/wiki/Coefficient_of_relationship#Kinship_coefficient + self <- data.frame(Id = pedDf$Id, Id2 = pedDf$Id, Species = pedDf$Species, Relationship = 'Self', ExpectedCoefficient = 0.5) + parentChild <- rbind( data.frame(Id = pedDf$Id, Id2 = pedDf$Dam, Species = pedDf$Species, Relationship = 'Child/Parent'), data.frame(Id = pedDf$Id, Id2 = pedDf$Sire, Species = pedDf$Species, Relationship = 'Child/Parent') - ) %>% filter(!is.na(Id2)) %>% mutate(ExpectedCoefficient = 0.5) + ) %>% filter(!is.na(Id2)) %>% mutate(ExpectedCoefficient = 0.25) grandParentOffspring1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf, by.x = c('Dam', 'Species'), by.y = c('Id', 'Species'), all.x = F, all.y = F) grandParentOffspring1 <- rbind( grandParentOffspring1 %>% select(Id, Dam.y, Species) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Maternal Granddam'), grandParentOffspring1 %>% select(Id, Sire.y, Species) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Maternal Grandsire') - ) %>% mutate(ExpectedCoefficient = 0.25) + ) %>% mutate(ExpectedCoefficient = 0.125) grandParentOffspring2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf, by.x = c('Sire', 'Species'), by.y = c('Id', 'Species'), all.x = F, all.y = F) grandParentOffspring2 <- rbind( grandParentOffspring2 %>% select(Id, Dam.y, Species) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Paternal Granddam'), grandParentOffspring2 %>% select(Id, Sire.y, Species) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Paternal Grandsire') - ) %>% mutate(ExpectedCoefficient = 0.25) + ) %>% mutate(ExpectedCoefficient = 0.125) fullSibs <- merge(pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], by = c('Sire', 'Dam', 'Species'), all.x = F, all.y = F) %>% select(Id.x, Id.y, Species) %>% rename(Id = Id.x, Id2 = Id.y) %>% + filter(Id != Id2) %>% mutate(Relationship = 'Full sib', ExpectedCoefficient = 0.25) ret <- rbind( @@ -102,21 +105,24 @@ generateExpectedKinship <- function(pedDf) { return(paste0(x[2], '/', x[1])) }) - return(rbind(ret, ret2)) + return(rbind(self, ret, ret2)) } # Basic validation: -toValidate <- merge(newRecords, generateExpectedKinship(pedDf), by = c('Id', 'Id2', 'Species'), all.x = T, all.y = T) %>% +toValidate <- merge(newRecords, generateExpectedKinship(allPed), by = c('Id', 'Id2', 'Species'), all.x = T, all.y = T) +write.table(file = 'kinshipWithExpectedValues.txt', toValidate, sep = '\t', quote = F) + +toValidate <- toValidate %>% filter(!is.na(ExpectedCoefficient)) %>% - mutate(MinCoefficient = ExpectedCoefficient / 2) %>% - filter(is.na(coefficient) | coefficient < MinCoefficient) + filter(is.na(coefficient) | coefficient < ExpectedCoefficient) if (nrow(toValidate) > 0) { print(paste0('There were unexpected kinship values! See the file kinshipErrors.txt for more information')) - write.table(newRecords, file = "kinshipErrors.txt", append = FALSE, row.names=F, quote=F, sep='\t') + write.table(newRecords, file = "kinshipErrors.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t') +} else { + print('All coefficients were within expected ranges from predicted values') } +newRecords$Species <- NULL # write TSV to disk -print("Output table:") -print(str(newRecords)) -write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep='\t') \ No newline at end of file +write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t') \ No newline at end of file From dbb5c4607c7892549db5ed126739dc750f0e50ee Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 4 Sep 2023 12:05:18 -0700 Subject: [PATCH 3/9] Add action to allow external server to trigger re-import of externally computed kinship data --- .../ehr/pipeline/GeneticCalculationsRunnable.java | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java index 4d6f0bf5b..b3bd6c833 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java @@ -31,6 +31,7 @@ import org.labkey.api.pipeline.file.FileAnalysisTaskPipeline; import org.labkey.api.security.User; import org.labkey.api.util.ConfigurationException; +import org.labkey.api.util.StringUtilsLabKey; import org.labkey.api.util.logging.LogHelper; import org.labkey.api.view.ActionURL; import org.labkey.api.view.ViewBackgroundInfo; @@ -73,12 +74,12 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay { String taskIdString = FileAnalysisTaskPipeline.class.getName() + ":" + KINSHIP_PIPELINE_NAME; TaskId taskId = new TaskId(taskIdString); - TaskPipeline taskPipeline = PipelineJobService.get().getTaskPipeline(taskId); + TaskPipeline taskPipeline = PipelineJobService.get().getTaskPipeline(taskId); if (taskPipeline == null) throw new PipelineJobException("Unable to find kinship pipeline: " + taskId); AbstractFileAnalysisProvider provider = (AbstractFileAnalysisProvider) PipelineService.get().getPipelineProvider("File Analysis"); - AbstractFileAnalysisProtocolFactory factory = provider.getProtocolFactory(taskPipeline); + AbstractFileAnalysisProtocolFactory factory = provider.getProtocolFactory(taskPipeline); ViewBackgroundInfo bg = new ViewBackgroundInfo(c, u, new ActionURL()); PipeRoot root = PipelineService.get().getPipelineRootSetting(c); String protocolName = "EHR Kinship Calculation"; @@ -87,7 +88,7 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay (allowRunningDuringDay ? "\ttrue" : "") + ""; - AbstractFileAnalysisProtocol protocol = factory.createProtocolInstance(protocolName, "", xml); + AbstractFileAnalysisProtocol protocol = factory.createProtocolInstance(protocolName, "", xml); if (protocol == null) { return; @@ -111,7 +112,7 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay defaultXml.getParentFile().mkdirs(); defaultXml.createNewFile(); - try (FileWriter w = new FileWriter(defaultXml)) + try (FileWriter w = new FileWriter(defaultXml, StringUtilsLabKey.DEFAULT_CHARSET)) { w.write(xml); } @@ -135,11 +136,7 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay { throw new ConfigurationException("The EHR kinship pipeline has not been configured on this server", e); } - catch (IOException e) - { - throw new PipelineJobException(e); - } - catch (PipelineValidationException e) + catch (IOException | PipelineValidationException e) { throw new PipelineJobException(e); } From d301f032dc4c980632a6e3dea1f657a8aacafffd Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 4 Sep 2023 13:35:12 -0700 Subject: [PATCH 4/9] Correct typos --- ehr/resources/pipeline/kinship/populateInbreeding.r | 6 +++--- ehr/resources/pipeline/kinship/populateKinship.r | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ehr/resources/pipeline/kinship/populateInbreeding.r b/ehr/resources/pipeline/kinship/populateInbreeding.r index 629178452..bc5b6b95e 100644 --- a/ehr/resources/pipeline/kinship/populateInbreeding.r +++ b/ehr/resources/pipeline/kinship/populateInbreeding.r @@ -21,9 +21,9 @@ allPed <- read.table(opts$inputFile) colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender') allPed$Id[allPed$Id == ""] <- NA -allPed$Id[allPed$Dam == ""] <- NA -allPed$Id[allPed$Sire == ""] <- NA -allPed$Id[allPed$Gender == ""] <- NA +allPed$Dam[allPed$Dam == ""] <- NA +allPed$Sire[allPed$Sire == ""] <- NA +allPed$Gender[allPed$Gender == ""] <- NA df <- data.frame(id=as.character(allPed$Id), 'id parent1'=allPed$Dam, 'id parent2'=allPed$Sire, stringsAsFactors=FALSE) originalIds <- df$id diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index 762329d7a..36ff7be58 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -21,9 +21,9 @@ allPed <- read.table(opts$inputFile, quote="\"") colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species') allPed$Id[allPed$Id == ""] <- NA -allPed$Id[allPed$Dam == ""] <- NA -allPed$Id[allPed$Sire == ""] <- NA -allPed$Id[allPed$Gender == ""] <- NA +allPed$Dam[allPed$Dam == ""] <- NA +allPed$Sire[allPed$Sire == ""] <- NA +allPed$Gender[allPed$Gender == ""] <- NA allPed$Species <- as.character(allPed$Species) allPed$Species[is.na(allPed$Species)] <- c('Unknown') From 2c3dd1532dd22664e5f6d362de16b230172eb7cd Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 4 Sep 2023 14:04:13 -0700 Subject: [PATCH 5/9] Minor cleanup --- .../org/labkey/ehr/pipeline/GeneticCalculationsInitTask.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsInitTask.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsInitTask.java index 14cb4948f..fb6482853 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsInitTask.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsInitTask.java @@ -33,6 +33,7 @@ import org.labkey.api.query.UserSchema; import org.labkey.api.util.FileType; import org.labkey.api.util.PageFlowUtil; +import org.labkey.api.writer.PrintWriters; import org.springframework.jdbc.BadSqlGrammarException; import java.io.File; @@ -84,7 +85,7 @@ public List getProtocolActionNames() } @Override - public PipelineJob.Task createTask(PipelineJob job) + public PipelineJob.Task createTask(PipelineJob job) { GeneticCalculationsInitTask task = new GeneticCalculationsInitTask(this, job); setJoin(false); @@ -138,7 +139,7 @@ public RecordedActionSet run() throws PipelineJobException File outputFile = new File(support.getAnalysisDirectory(), GeneticCalculationsImportTask.PEDIGREE_FILE); - try (CSVWriter writer = new CSVWriter(new OutputStreamWriter(new FileOutputStream(outputFile)), '\t', CSVWriter.DEFAULT_QUOTE_CHARACTER)) + try (CSVWriter writer = new CSVWriter(PrintWriters.getPrintWriter(outputFile), '\t', CSVWriter.DEFAULT_QUOTE_CHARACTER)) { long count = ts.getRowCount(); if (count > 0) From b99c3da059046201f36b09816b029cc0927f75ce Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 5 Sep 2023 14:48:39 -0700 Subject: [PATCH 6/9] - Refactor kinship script to further reduce memory - Refactor kinship script to optionally merge species where hybrids are present and process together --- .../pipeline/kinship/populateKinship.r | 248 +++++++++++++----- .../panel/GeneticCalculationSettingsPanel.js | 18 +- ehr/src/org/labkey/ehr/EHRController.java | 14 +- .../GeneticCalculationsImportTask.java | 4 +- .../ehr/pipeline/GeneticCalculationsJob.java | 13 +- .../pipeline/GeneticCalculationsRTask.java | 23 +- .../pipeline/GeneticCalculationsRunnable.java | 2 + 7 files changed, 243 insertions(+), 79 deletions(-) diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index 36ff7be58..ed1e55037 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -13,116 +13,222 @@ library(Matrix) library(dplyr) spec <- matrix(c( - 'inputFile', '-f', 1, "character" + 'inputFile', '-f', 1, 'character', + 'mergeSpeciesWithHybrids', '-m', 0, 'logical', + 'performKinshipValidation', '-v', 0, 'logical' ), ncol=4, byrow=TRUE) opts <- getopt(spec, commandArgs(trailingOnly = TRUE)) +if (is.null(opts$mergeSpeciesWithHybrids)){ + opts$mergeSpeciesWithHybrids <- FALSE +} + +if (is.null(opts$performKinshipValidation)){ + opts$performKinshipValidation <- FALSE +} + allPed <- read.table(opts$inputFile, quote="\"") colnames(allPed)<-c('Id', 'Dam', 'Sire', 'Gender', 'Species') allPed$Id[allPed$Id == ""] <- NA allPed$Dam[allPed$Dam == ""] <- NA allPed$Sire[allPed$Sire == ""] <- NA -allPed$Gender[allPed$Gender == ""] <- NA +allPed$Gender[allPed$Gender == "" | is.na(allPed$Gender)] <- 3 # 3 = unknown allPed$Species <- as.character(allPed$Species) allPed$Species[is.na(allPed$Species)] <- c('Unknown') allPed$Species <- as.factor(allPed$Species) -# In order to reduce the max matrix size, calculate famids using makefamid, then analyze each group separately -# It resizes the biggest matrix from 12000^2 to 8200^2 thus reduces the memory used by half -newRecords <- NULL -for (species in unique(allPed$Species)){ - allRecordsForSpecies <- allPed[allPed$Species == species,] - print(paste0('Processing species: ', species, ', with ', nrow(allRecordsForSpecies), ' IDs')) - - # Add missing parents for accurate kinship calculations - fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender)) - - # Kinship is expecting records to be sorted IAW it's own pedigree function - recordsForSpecies <- with(fixedRecords, pedigree(id=id,dadid=dadid,momid=momid,sex=sex,missid=0)) - - temp.kin <- kinship(recordsForSpecies) +if (any(allPed$Species == 'Unknown')) { + print(paste0('There are ', sum(allPed$Species == 'Unknown'), ' Ids with species = Unknown')) +} - # Add rownames to make matrix symmetric, which is required downstream - rownames(temp.kin) <- colnames(temp.kin) +# The purpose of this function is to handle instances where there was cross-breeding. +# While this is probably rare, it can occur. When this happens, simply merge the entire species together and process as one unit. +# This ensures that all relevant ancestors from both species are present +generateSpeciesToProcess <- function(allPed, mergeSpeciesWithHybrids) { + hybridParents <- dplyr::bind_rows( + merge(allPed, allPed, by.x = 'Dam', by.y = 'Id') %>% filter(Species.x != Species.y) %>% select(Id, Dam, Species.x, Species.y) %>% rename(Parent = Dam) %>% mutate(ParentType = 'Dam'), + merge(allPed, allPed, by.x = 'Sire', by.y = 'Id') %>% filter(Species.x != Species.y) %>% select(Id, Sire, Species.x, Species.y) %>% rename(Parent = Sire) %>% mutate(ParentType = 'Sire') + ) - # Convert kinship matrix to a triplet list of two ids and their coefficient - summaryDf <- as.data.frame(summary(as(temp.kin, "dgCMatrix"))) - idList <- rownames(temp.kin) - temp.tri <- data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x) + if (nrow(hybridParents) > 0) { + print(paste0('There were ', nrow(hybridParents), ' records with parents of a different species')) + } + + if (mergeSpeciesWithHybrids && nrow(hybridParents) > 0) { + speciesGroups <- as.list(as.character(unique(allPed$Species))) + toMerge <- unique(hybridParents[c('Species.x', 'Species.y')]) + for (idx in 1:nrow(toMerge)){ + speciesToCollapse <- as.character(unlist(toMerge[idx,,drop = TRUE])) + matchingIdx <- sapply(speciesGroups, function(x){ + return(length(intersect(speciesToCollapse, x)) > 0) + }) + + speciesToCollapse <- sort(unique(c(speciesToCollapse, unlist(speciesGroups[matchingIdx])))) + speciesGroups <- speciesGroups[!matchingIdx] + speciesGroups <- append(speciesGroups, list(speciesToCollapse)) + } - # Now filter out parents added for kinship calculation - temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE)) - temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE)) - temp.tri$Species <- species + print('These species will be merged for processing:') + invisible(lapply(speciesGroups, function(x){ + if (length(x) > 1) { + print(x) + } + })) + + return(speciesGroups) + } else { + return(unique(allPed$Species[allPed$Species != 'Unknown'])) + } +} - newRecords <- rbind(newRecords,temp.tri) +validateExpectedKinshipSubset <- function(dataToTest, expectedValues, errorRows, testReciprocal = TRUE) { + # Generate the reciprocal of relationships as well: + if (testReciprocal) { + ret2 <- data.frame(Id = expectedValues$Id2, Id2 = expectedValues$Id, Relationship = expectedValues$Relationship, ExpectedCoefficient = expectedValues$ExpectedCoefficient) + ret2$Relationship <- sapply(expectedValues$Relationship, function(x){ + x <- unlist(strsplit(x, split = '/')) + if (length(x) == 1) { + return(x) + } + + return(paste0(x[2], '/', x[1])) + }) + expectedValues <- dplyr::bind_rows(expectedValues, ret2) + rm(ret2) + } + + dat <- merge(dataToTest, expectedValues, by = c('Id', 'Id2'), all.x = T, all.y = T) %>% + filter(!is.na(ExpectedCoefficient)) %>% + filter(is.na(coefficient) | coefficient < ExpectedCoefficient) + + if (nrow(dat) == 0) { + return(errorRows) + } + + if (all(is.null(errorRows))) { + return(dat) + } + + return(dplyr::bind_rows(errorRows, dat)) } -generateExpectedKinship <- function(pedDf) { +validateExpectedKinship <- function(pedDf, dataToTest) { + errorRows <- NULL + # See reference: https://en.wikipedia.org/wiki/Coefficient_of_relationship#Kinship_coefficient - self <- data.frame(Id = pedDf$Id, Id2 = pedDf$Id, Species = pedDf$Species, Relationship = 'Self', ExpectedCoefficient = 0.5) + self <- data.frame(Id = pedDf$Id, Id2 = pedDf$Id, Relationship = 'Self', ExpectedCoefficient = 0.5) + errorRows <- validateExpectedKinshipSubset(dataToTest, self, errorRows, testReciprocal = FALSE) + rm(self) - parentChild <- rbind( - data.frame(Id = pedDf$Id, Id2 = pedDf$Dam, Species = pedDf$Species, Relationship = 'Child/Parent'), - data.frame(Id = pedDf$Id, Id2 = pedDf$Sire, Species = pedDf$Species, Relationship = 'Child/Parent') + parentChild <- dplyr::bind_rows( + data.frame(Id = pedDf$Id, Id2 = pedDf$Dam, Relationship = 'Child/Parent'), + data.frame(Id = pedDf$Id, Id2 = pedDf$Sire, Relationship = 'Child/Parent') ) %>% filter(!is.na(Id2)) %>% mutate(ExpectedCoefficient = 0.25) + errorRows <- validateExpectedKinshipSubset(dataToTest, parentChild, errorRows) + rm(parentChild) - grandParentOffspring1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf, by.x = c('Dam', 'Species'), by.y = c('Id', 'Species'), all.x = F, all.y = F) - grandParentOffspring1 <- rbind( - grandParentOffspring1 %>% select(Id, Dam.y, Species) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Maternal Granddam'), - grandParentOffspring1 %>% select(Id, Sire.y, Species) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Maternal Grandsire') + grandParentOffspring1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf, by.x = c('Dam'), by.y = c('Id'), all.x = F, all.y = F) + grandParentOffspring1 <- dplyr::bind_rows( + grandParentOffspring1 %>% select(Id, Dam.y) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Maternal Granddam'), + grandParentOffspring1 %>% select(Id, Sire.y) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Maternal Grandsire') ) %>% mutate(ExpectedCoefficient = 0.125) + errorRows <- validateExpectedKinshipSubset(dataToTest, grandParentOffspring1, errorRows) + rm(grandParentOffspring1) - grandParentOffspring2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf, by.x = c('Sire', 'Species'), by.y = c('Id', 'Species'), all.x = F, all.y = F) - grandParentOffspring2 <- rbind( - grandParentOffspring2 %>% select(Id, Dam.y, Species) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Paternal Granddam'), - grandParentOffspring2 %>% select(Id, Sire.y, Species) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Paternal Grandsire') + grandParentOffspring2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf, by.x = c('Sire'), by.y = c('Id'), all.x = F, all.y = F) + grandParentOffspring2 <- dplyr::bind_rows( + grandParentOffspring2 %>% select(Id, Dam.y) %>% filter(!is.na(Dam.y)) %>% rename(Id2 = Dam.y) %>% mutate(Relationship = 'Grandchild/Paternal Granddam'), + grandParentOffspring2 %>% select(Id, Sire.y) %>% filter(!is.na(Sire.y)) %>% rename(Id2 = Sire.y) %>% mutate(Relationship = 'Grandchild/Paternal Grandsire') ) %>% mutate(ExpectedCoefficient = 0.125) + errorRows <- validateExpectedKinshipSubset(dataToTest, grandParentOffspring2, errorRows) + rm(grandParentOffspring2) - fullSibs <- merge(pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], by = c('Sire', 'Dam', 'Species'), all.x = F, all.y = F) %>% - select(Id.x, Id.y, Species) %>% + fullSibs <- merge(pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Dam) & !is.na(pedDf$Sire),], by = c('Sire', 'Dam'), all.x = F, all.y = F) %>% + select(Id.x, Id.y) %>% rename(Id = Id.x, Id2 = Id.y) %>% filter(Id != Id2) %>% mutate(Relationship = 'Full sib', ExpectedCoefficient = 0.25) + errorRows <- validateExpectedKinshipSubset(dataToTest, fullSibs, errorRows) + rm(fullSibs) - ret <- rbind( - parentChild, - grandParentOffspring1, - grandParentOffspring2, - fullSibs - ) - - # Generate the reciprocal of relationships as well: - ret2 <- data.frame(Id = ret$Id2, Id2 = ret$Id, Species = ret$Species, Relationship = ret$Relationship, ExpectedCoefficient = ret$ExpectedCoefficient) - ret2$Relationship <- sapply(ret2$Relationship, function(x){ - x <- unlist(strsplit(x, split = '/')) - if (length(x) == 1) { - return(x) - } + halfSibs1 <- merge(pedDf[!is.na(pedDf$Dam),], pedDf[!is.na(pedDf$Dam),], by = c('Dam'), all.x = F, all.y = F) %>% + filter(Sire.x != Sire.y) %>% + select(Id.x, Id.y) %>% + rename(Id = Id.x, Id2 = Id.y) %>% + filter(Id != Id2) %>% + mutate(Relationship = 'Half sib', ExpectedCoefficient = 0.125) + errorRows <- validateExpectedKinshipSubset(dataToTest, halfSibs1, errorRows) + rm(halfSibs1) - return(paste0(x[2], '/', x[1])) - }) + halfSibs2 <- merge(pedDf[!is.na(pedDf$Sire),], pedDf[!is.na(pedDf$Sire),], by = c('Sire'), all.x = F, all.y = F) %>% + filter(Dam.x != Dam.y) %>% + select(Id.x, Id.y) %>% + rename(Id = Id.x, Id2 = Id.y) %>% + filter(Id != Id2) %>% + mutate(Relationship = 'Half sib', ExpectedCoefficient = 0.125) + errorRows <- validateExpectedKinshipSubset(dataToTest, halfSibs2, errorRows) + rm(halfSibs2) - return(rbind(self, ret, ret2)) + return(errorRows) } -# Basic validation: -toValidate <- merge(newRecords, generateExpectedKinship(allPed), by = c('Id', 'Id2', 'Species'), all.x = T, all.y = T) -write.table(file = 'kinshipWithExpectedValues.txt', toValidate, sep = '\t', quote = F) +speciesToProcess <- generateSpeciesToProcess(allPed, opts$mergeSpeciesWithHybrids) -toValidate <- toValidate %>% - filter(!is.na(ExpectedCoefficient)) %>% - filter(is.na(coefficient) | coefficient < ExpectedCoefficient) +newRecords <- NULL +for (speciesSet in speciesToProcess){ + allRecordsForSpecies <- allPed[allPed$Species %in% speciesSet,] + print(paste0('Processing species set: ', paste0(speciesSet, collapse = ','), ', with ', nrow(allRecordsForSpecies), ' IDs')) + if (nrow(allRecordsForSpecies) == 1) { + print('single record, skipping') + newRecords <- dplyr::bind_rows(newRecords,data.frame(Id = allRecordsForSpecies$Id, Id2 = allRecordsForSpecies$Id, coefficient = 0.5, Species = allRecordsForSpecies$Species)) + next + } + + pctMissingSex <- sum(allRecordsForSpecies$Gender > 2) / nrow(allRecordsForSpecies) + if (pctMissingSex > 0.25) { + paste0('More than 25% of this species group are missing sex and cannot be processed by fixParents(), skipping') + newRecords <- dplyr::bind_rows(newRecords,data.frame(Id = allRecordsForSpecies$Id, Id2 = allRecordsForSpecies$Id, coefficient = 0.5, Species = allRecordsForSpecies$Species)) + next + } + + # Add missing parents for accurate kinship calculations + fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender)) -if (nrow(toValidate) > 0) { - print(paste0('There were unexpected kinship values! See the file kinshipErrors.txt for more information')) - write.table(newRecords, file = "kinshipErrors.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t') -} else { - print('All coefficients were within expected ranges from predicted values') + # Kinship is expecting records to be sorted IAW it's own pedigree function + recordsForSpecies <- with(fixedRecords, pedigree(id = id, dadid = dadid, momid = momid, sex = sex, missid = 0)) + + temp.kin <- kinship(recordsForSpecies) + + # Add rownames to make matrix symmetric, which is required downstream + rownames(temp.kin) <- colnames(temp.kin) + + # Convert kinship matrix to a triplet list of two ids and their coefficient + summaryDf <- as.data.frame(summary(as(temp.kin, "dgCMatrix"))) + idList <- rownames(temp.kin) + temp.tri <- data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x) + + # Now filter out parents added for kinship calculation + temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE)) + temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE)) + temp.tri <- merge(temp.tri, allRecordsForSpecies[c('Id', 'Species')], by = 'Id', all.x = TRUE) + + newRecords <- dplyr::bind_rows(newRecords,temp.tri) + + # NOTE: perform per-species to save memory + if (opts$performKinshipValidation) { + print('Validating coefficients against expected values') + errorRows <- validateExpectedKinship(allRecordsForSpecies, temp.tri) + if (!all(is.null(errorRows))) { + fileName <- paste0('kinshipErrors_', paste0(speciesSet, collapse = '.'), '.txt') + print(paste0('There were unexpected kinship values! See the file ', fileName, ' for more information')) + write.table(newRecords, file = fileName, row.names = FALSE, quote = FALSE, sep = '\t') + } else { + print('All coefficients were within expected ranges from predicted values') + } + } } -newRecords$Species <- NULL # write TSV to disk write.table(newRecords, file = "kinship.txt", append = FALSE, row.names = FALSE, quote = FALSE, sep = '\t') \ No newline at end of file diff --git a/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js b/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js index 0474ed15b..f4b8a3257 100644 --- a/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js +++ b/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js @@ -28,7 +28,7 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { xtype: 'checkbox', fieldLabel: 'Kinship validation?', itemId: 'kinshipValidation', - listeners : { + listeners: { render: function(c) { Ext4.create('Ext.tip.ToolTip', { target: c.getEl(), @@ -38,6 +38,18 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { }); } } + },{ + xtype: 'checkbox', + fieldLabel: 'Merge Species With Hybrids?', + itemId: 'mergeSpeciesWithHybrids', + listeners: { + render: function (c) { + Ext4.create('Ext.tip.ToolTip', { + target: c.getEl(), + html: 'If any hybrid animals are detected, these species groups will be merged and processed as one unit. Merging all these species together ensures that the correct ancestors from each side are present' + }); + } + } },{ xtype: 'numberfield', hideTrigger: true, @@ -94,6 +106,7 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { this.down('#hourOfDay').setValue(results.hourOfDay); this.down('#containerPath').setValue(results.containerPath); this.down('#kinshipValidation').setValue(results.kinshipValidation); + this.down('#mergeSpeciesWithHybrids').setValue(results.mergeSpeciesWithHybrids); }, saveData: function(){ @@ -104,7 +117,8 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { containerPath: this.down('#containerPath').getValue(), enabled: this.down('#enabled').getValue(), hourOfDay: this.down('#hourOfDay').getValue(), - kinshipValidation: this.down('#kinshipValidation').getValue() + kinshipValidation: this.down('#kinshipValidation').getValue(), + mergeSpeciesWithHybrids: this.down('#mergeSpeciesWithHybrids').getValue() }, method : 'POST', scope: this, diff --git a/ehr/src/org/labkey/ehr/EHRController.java b/ehr/src/org/labkey/ehr/EHRController.java index 6fefb9a74..5b9487a43 100644 --- a/ehr/src/org/labkey/ehr/EHRController.java +++ b/ehr/src/org/labkey/ehr/EHRController.java @@ -639,7 +639,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er errors.reject(ERROR_MSG, "Unable to find container for path: " + form.getContainerPath()); return null; } - GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation()); + GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation(), form.isMergeSpeciesWithHybrids()); return new ApiSimpleResponse("success", true); } @@ -759,6 +759,7 @@ public static class ScheduleGeneticCalculationForm private int hourOfDay; private boolean _kinshipValidation; + private boolean _mergeSpeciesWithHybrids; public boolean isEnabled() { @@ -799,6 +800,16 @@ public void setKinshipValidation(boolean kinshipValidation) { _kinshipValidation = kinshipValidation; } + + public boolean isMergeSpeciesWithHybrids() + { + return _mergeSpeciesWithHybrids; + } + + public void setMergeSpeciesWithHybrids(boolean mergeSpeciesWithHybrids) + { + _mergeSpeciesWithHybrids = mergeSpeciesWithHybrids; + } } @RequiresPermission(AdminPermission.class) @@ -817,6 +828,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er ret.put("enabled", GeneticCalculationsJob.isEnabled()); ret.put("hourOfDay", GeneticCalculationsJob.getHourOfDay()); ret.put("kinshipValidation", GeneticCalculationsJob.isKinshipValidation()); + ret.put("mergeSpeciesWithHybrids", GeneticCalculationsJob.isMergeSpeciesWithHybrids()); return new ApiSimpleResponse(ret); } diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java index 5bd5adbe5..30b5ebbc9 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsImportTask.java @@ -153,8 +153,10 @@ public RecordedActionSet run() throws PipelineJobException processInbreeding(job.getContainer(), job.getUser(), support.getAnalysisDirectoryPath().toFile(), job.getLogger()); processKinship(job.getContainer(), job.getUser(), support.getAnalysisDirectoryPath().toFile(), job.getLogger()); - if (GeneticCalculationsJob.isKinshipValidation()) + if (Boolean.parseBoolean(getJob().getParameters().get("kinshipValidation"))) + { validateKinship(); + } } return new RecordedActionSet(actions); diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java index f6bb04170..a5e86ad0a 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java @@ -122,6 +122,16 @@ public static boolean isKinshipValidation() return false; } + public static boolean isMergeSpeciesWithHybrids() + { + Map saved = PropertyManager.getProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN); + + if (saved.containsKey("mergeSpeciesWithHybrids")) + return Boolean.parseBoolean(saved.get("mergeSpeciesWithHybrids")); + else + return false; + } + public static boolean isEnabled() { Map saved = PropertyManager.getProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN); @@ -152,13 +162,14 @@ public static Integer getHourOfDay() return null; } - public static void setProperties(Boolean isEnabled, Container c, Integer hourOfDay, Boolean isKinshipValidation) + public static void setProperties(Boolean isEnabled, Container c, Integer hourOfDay, Boolean isKinshipValidation, Boolean mergeSpeciesWithHybrids) { PropertyManager.PropertyMap props = PropertyManager.getWritableProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN, true); props.put("enabled", isEnabled.toString()); props.put("container", c.getId()); props.put("hourOfDay", hourOfDay.toString()); props.put("kinshipValidation", isKinshipValidation.toString()); + props.put("mergeSpeciesWithHybrids", mergeSpeciesWithHybrids.toString()); props.save(); //unschedule in case settings have changed diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java index 1fe9d72aa..47f7d1a4d 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRTask.java @@ -17,6 +17,7 @@ import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; import org.labkey.api.module.Module; import org.labkey.api.module.ModuleLoader; import org.labkey.api.pipeline.AbstractTaskFactory; @@ -103,13 +104,25 @@ public RecordedActionSet run() throws PipelineJobException { List actions = new ArrayList<>(); - actions.add(runScript("populateInbreeding.r", GeneticCalculationsImportTask.INBREEDING_FILE, "Inbreeding Coefficient Output")); - actions.add(runScript("populateKinship.r", GeneticCalculationsImportTask.KINSHIP_FILE, "Kinship Output")); + actions.add(runScript("populateInbreeding.r", GeneticCalculationsImportTask.INBREEDING_FILE, "Inbreeding Coefficient Output", null)); + + List kinshipArgs = new ArrayList<>(); + if (getJob().getParameters().containsKey("mergeSpeciesWithHybrids") && "true".equalsIgnoreCase(getJob().getParameters().get("mergeSpeciesWithHybrids"))) + { + kinshipArgs.add("-m"); + } + + if (getJob().getParameters().containsKey("kinshipValidation") && "true".equalsIgnoreCase(getJob().getParameters().get("kinshipValidation"))) + { + kinshipArgs.add("-v"); + } + + actions.add(runScript("populateKinship.r", GeneticCalculationsImportTask.KINSHIP_FILE, "Kinship Output", kinshipArgs)); return new RecordedActionSet(actions); } - public RecordedAction runScript(String scriptName, String outputFileName, String actionLabel) throws PipelineJobException + public RecordedAction runScript(String scriptName, String outputFileName, String actionLabel, @Nullable List extraArgs) throws PipelineJobException { PipelineJob job = getJob(); FileAnalysisJobSupport support = (FileAnalysisJobSupport) job; @@ -133,6 +146,10 @@ public RecordedAction runScript(String scriptName, String outputFileName, String args.add(scriptPath); args.add("-f"); args.add(tsvFile.getPath()); + if (extraArgs != null) + { + args.addAll(extraArgs); + } getJob().getLogger().info("Using working directory of: " + support.getAnalysisDirectory().getPath()); ProcessBuilder pb = new ProcessBuilder(args); diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java index b3bd6c833..45f471727 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java @@ -86,6 +86,8 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay String xml = "\n" + "\n" + (allowRunningDuringDay ? "\ttrue" : "") + + "\t" + GeneticCalculationsJob.isKinshipValidation() + "" + + "\t" + GeneticCalculationsJob.isMergeSpeciesWithHybrids() + "" + ""; AbstractFileAnalysisProtocol protocol = factory.createProtocolInstance(protocolName, "", xml); From 4af97091be132e7aad829093de87f36044bf3701 Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 5 Sep 2023 17:15:58 -0700 Subject: [PATCH 7/9] Avoid another fringe kinship case --- ehr/resources/pipeline/kinship/populateKinship.r | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index ed1e55037..465fd10fc 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -84,6 +84,10 @@ generateSpeciesToProcess <- function(allPed, mergeSpeciesWithHybrids) { } validateExpectedKinshipSubset <- function(dataToTest, expectedValues, errorRows, testReciprocal = TRUE) { + if (nrow(dataToTest) == 0 || nrow(expectedValues) == 0) { + return(errorRows) + } + # Generate the reciprocal of relationships as well: if (testReciprocal) { ret2 <- data.frame(Id = expectedValues$Id2, Id2 = expectedValues$Id, Relationship = expectedValues$Relationship, ExpectedCoefficient = expectedValues$ExpectedCoefficient) From 2cd09b16f5aeb436c810acf8d9db9ae9ee7d96f1 Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 12 Sep 2023 11:33:25 -0700 Subject: [PATCH 8/9] Add setting to control whether kinship import is allowed during business hours --- .../panel/GeneticCalculationSettingsPanel.js | 8 +++++++- ehr/src/org/labkey/ehr/EHRController.java | 17 +++++++++++++++-- .../ehr/pipeline/GeneticCalculationsJob.java | 16 ++++++++++++++-- .../pipeline/GeneticCalculationsRunnable.java | 2 +- 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js b/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js index f4b8a3257..3d14716b4 100644 --- a/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js +++ b/ehr/resources/web/ehr/panel/GeneticCalculationSettingsPanel.js @@ -24,6 +24,10 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { xtype: 'checkbox', fieldLabel: 'Is Enabled?', itemId: 'enabled' + },{ + xtype: 'checkbox', + fieldLabel: 'Allow Import During Business Hours?', + itemId: 'allowImportDuringBusinessHours' },{ xtype: 'checkbox', fieldLabel: 'Kinship validation?', @@ -107,6 +111,7 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { this.down('#containerPath').setValue(results.containerPath); this.down('#kinshipValidation').setValue(results.kinshipValidation); this.down('#mergeSpeciesWithHybrids').setValue(results.mergeSpeciesWithHybrids); + this.down('#allowImportDuringBusinessHours').setValue(results.allowImportDuringBusinessHours) }, saveData: function(){ @@ -118,7 +123,8 @@ Ext4.define('EHR.panel.GeneticCalculationSettingsPanel', { enabled: this.down('#enabled').getValue(), hourOfDay: this.down('#hourOfDay').getValue(), kinshipValidation: this.down('#kinshipValidation').getValue(), - mergeSpeciesWithHybrids: this.down('#mergeSpeciesWithHybrids').getValue() + mergeSpeciesWithHybrids: this.down('#mergeSpeciesWithHybrids').getValue(), + allowImportDuringBusinessHours: this.down('#allowImportDuringBusinessHours').getValue() }, method : 'POST', scope: this, diff --git a/ehr/src/org/labkey/ehr/EHRController.java b/ehr/src/org/labkey/ehr/EHRController.java index 5b9487a43..2561d5f73 100644 --- a/ehr/src/org/labkey/ehr/EHRController.java +++ b/ehr/src/org/labkey/ehr/EHRController.java @@ -83,6 +83,7 @@ import org.labkey.api.settings.AppProps; import org.labkey.api.study.DatasetTable; import org.labkey.api.util.ExceptionUtil; +import org.labkey.api.util.HtmlString; import org.labkey.api.util.HtmlStringBuilder; import org.labkey.api.util.PageFlowUtil; import org.labkey.api.util.Path; @@ -639,7 +640,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er errors.reject(ERROR_MSG, "Unable to find container for path: " + form.getContainerPath()); return null; } - GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation(), form.isMergeSpeciesWithHybrids()); + GeneticCalculationsJob.setProperties(form.isEnabled(), c, form.getHourOfDay(), form.isKinshipValidation(), form.isMergeSpeciesWithHybrids(), form.isAllowImportDuringBusinessHours()); return new ApiSimpleResponse("success", true); } @@ -760,6 +761,7 @@ public static class ScheduleGeneticCalculationForm private boolean _kinshipValidation; private boolean _mergeSpeciesWithHybrids; + private boolean _allowImportDuringBusinessHours; public boolean isEnabled() { @@ -810,6 +812,16 @@ public void setMergeSpeciesWithHybrids(boolean mergeSpeciesWithHybrids) { _mergeSpeciesWithHybrids = mergeSpeciesWithHybrids; } + + public boolean isAllowImportDuringBusinessHours() + { + return _allowImportDuringBusinessHours; + } + + public void setAllowImportDuringBusinessHours(boolean allowImportDuringBusinessHours) + { + _allowImportDuringBusinessHours = allowImportDuringBusinessHours; + } } @RequiresPermission(AdminPermission.class) @@ -829,6 +841,7 @@ public ApiResponse execute(ScheduleGeneticCalculationForm form, BindException er ret.put("hourOfDay", GeneticCalculationsJob.getHourOfDay()); ret.put("kinshipValidation", GeneticCalculationsJob.isKinshipValidation()); ret.put("mergeSpeciesWithHybrids", GeneticCalculationsJob.isMergeSpeciesWithHybrids()); + ret.put("allowImportDuringBusinessHours", GeneticCalculationsJob.isAllowImportDuringBusinessHours()); return new ApiSimpleResponse(ret); } @@ -1262,7 +1275,7 @@ public void validateCommand(Object form, Errors errors) @Override public ModelAndView getConfirmView(Object form, BindException errors) { - return new HtmlView("This will cause the system to recalculate kinship and inbreeding coefficients on the colony. Do you want to continue?"); + return new HtmlView(HtmlString.of("This will cause the system to recalculate kinship and inbreeding coefficients on the colony. Do you want to continue?")); } @Override diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java index a5e86ad0a..883ff9188 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsJob.java @@ -132,6 +132,17 @@ public static boolean isMergeSpeciesWithHybrids() return false; } + + public static boolean isAllowImportDuringBusinessHours() + { + Map saved = PropertyManager.getProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN); + + if (saved.containsKey("allowImportDuringBusinessHours")) + return Boolean.parseBoolean(saved.get("allowImportDuringBusinessHours")); + else + return false; + } + public static boolean isEnabled() { Map saved = PropertyManager.getProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN); @@ -162,7 +173,7 @@ public static Integer getHourOfDay() return null; } - public static void setProperties(Boolean isEnabled, Container c, Integer hourOfDay, Boolean isKinshipValidation, Boolean mergeSpeciesWithHybrids) + public static void setProperties(Boolean isEnabled, Container c, Integer hourOfDay, Boolean isKinshipValidation, Boolean mergeSpeciesWithHybrids, Boolean allowImportDuringBusinessHours) { PropertyManager.PropertyMap props = PropertyManager.getWritableProperties(GENETICCALCULATIONS_PROPERTY_DOMAIN, true); props.put("enabled", isEnabled.toString()); @@ -170,6 +181,7 @@ public static void setProperties(Boolean isEnabled, Container c, Integer hourOfD props.put("hourOfDay", hourOfDay.toString()); props.put("kinshipValidation", isKinshipValidation.toString()); props.put("mergeSpeciesWithHybrids", mergeSpeciesWithHybrids.toString()); + props.put("allowImportDuringBusinessHours", allowImportDuringBusinessHours.toString()); props.save(); //unschedule in case settings have changed @@ -193,7 +205,7 @@ public void execute(JobExecutionContext context) throws JobExecutionException try { _log.info("Running Scheduled Genetic Calculations Job"); - new GeneticCalculationsRunnable().run(c, false); + new GeneticCalculationsRunnable().run(c, isAllowImportDuringBusinessHours()); } catch (PipelineJobException e) { diff --git a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java index 45f471727..4ec2cc288 100644 --- a/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java +++ b/ehr/src/org/labkey/ehr/pipeline/GeneticCalculationsRunnable.java @@ -85,7 +85,7 @@ private void startCalculation(User u, Container c, boolean allowRunningDuringDay String protocolName = "EHR Kinship Calculation"; String xml = "\n" + "\n" + - (allowRunningDuringDay ? "\ttrue" : "") + + "\t" + allowRunningDuringDay + "" + "\t" + GeneticCalculationsJob.isKinshipValidation() + "" + "\t" + GeneticCalculationsJob.isMergeSpeciesWithHybrids() + "" + ""; From 6dfbed173f71fe548f3c67044e0fac131c5eb27a Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 28 Sep 2023 10:27:02 -0700 Subject: [PATCH 9/9] VetAssignmentAndCacheFalsePositives --- ehr/api-src/org/labkey/api/ehr/EHRDemographicsService.java | 5 +++++ .../ehr/demographics/EHRDemographicsServiceImpl.java | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/ehr/api-src/org/labkey/api/ehr/EHRDemographicsService.java b/ehr/api-src/org/labkey/api/ehr/EHRDemographicsService.java index 349b3ea4d..6ee69f0d4 100644 --- a/ehr/api-src/org/labkey/api/ehr/EHRDemographicsService.java +++ b/ehr/api-src/org/labkey/api/ehr/EHRDemographicsService.java @@ -50,4 +50,9 @@ static public void setInstance(EHRDemographicsService instance) abstract public AnimalRecord getAnimal(Container c, String id); abstract public List getAnimals(Container c, Collection ids); + + /** + * + */ + abstract public void recalculateForAllIdsInCache(final Container c, final String schema, final String query, final boolean async); } diff --git a/ehr/src/org/labkey/ehr/demographics/EHRDemographicsServiceImpl.java b/ehr/src/org/labkey/ehr/demographics/EHRDemographicsServiceImpl.java index e4e0f3291..9e05f4ab8 100644 --- a/ehr/src/org/labkey/ehr/demographics/EHRDemographicsServiceImpl.java +++ b/ehr/src/org/labkey/ehr/demographics/EHRDemographicsServiceImpl.java @@ -235,6 +235,13 @@ public void reportDataChange(Container c, String schema, String query, List cachedIds = _cache.getKeys().stream().map(x -> x.replaceAll(getCacheKeyPrefix(c), "")).toList(); + reportDataChange(c, Collections.singletonList(Pair.of(schema, query)), cachedIds, async); + } + public void reportDataChange(final Container c, final List> changed, final List ids, boolean async) { final User u = EHRService.get().getEHRUser(c);