diff --git a/ehr/resources/pipeline/kinship/populateKinship.r b/ehr/resources/pipeline/kinship/populateKinship.r index abffbd120..ef08381da 100644 --- a/ehr/resources/pipeline/kinship/populateKinship.r +++ b/ehr/resources/pipeline/kinship/populateKinship.r @@ -1,5 +1,5 @@ ## -# Copyright (c) 2013-2019 LabKey Corporation +# Copyright (c) 2012-2017 LabKey Corporation # # Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0 ## @@ -15,7 +15,11 @@ library(methods); library(kinship2); library(getopt); library(Matrix); -library(dplyr); +library(dataCompareR); +library(Rlabkey); +library(lme4); +library(parallel); + spec <- matrix(c( #'containerPath', '-c', 1, "character", @@ -36,38 +40,197 @@ allPed$Species <- as.character(allPed$Species) allPed$Species[is.na(allPed$Species)] <- c('Unknown') allPed$Species <- as.factor(allPed$Species) +##### commented out after checking data +# duplicatedId=allPed$Id[duplicated(allPed$Id)] +# allPed[allPed$Id%in%duplicatedId,]# Check for duplication: passed +# Id.Dam=unique(allPed$Dam) +# allPed[(allPed$Id%in%Id.Dam)&(allPed$Gender!=2),]#Check for Dam that is not female +# Id.Sire=unique(allPed$Sire) +# allPed[(allPed$Id%in%Id.Sire)&(allPed$Gender!=1),]#Check for Sire that is not male:1 not passed + +# this function adds missing parents to the pedigree +# it is similar to add.Inds from kinship; however, we retain gender +addMissing <- function(ped, species){ + if(ncol(ped)<4)stop("pedigree should have at least 4 columns") + head <- names(ped) + + nsires <- match(ped[,3],ped[,1])# [Quoc] change ped,2 to ped,3 + nsires <- as.character(unique(ped[is.na(nsires),3])) + nsires <- nsires[!is.na(nsires)] + if(length(nsires)){ + ped <- rbind(ped, data.frame(Id=nsires, Dam=rep(NA, length(nsires)), Sire=rep(NA, length(nsires)), Gender=rep(1, length(nsires)), Species=rep(species, length(nsires)))); + } + + ndams <- match(ped[,2],ped[,1])# [Quoc] change ped,3 to ped,2 + ndams <- as.character(unique(ped[is.na(ndams),2])) + ndams <- ndams[!is.na(ndams)]; + + if(length(ndams)){ + ped <- rbind(ped,data.frame(Id=ndams, Dam=rep(NA, length(ndams)), Sire=rep(NA, length(ndams)), Gender=rep(2, length(ndams)), Species=rep(species, length(ndams)))); + } + + names(ped) <- head + return(ped) +} + +print(paste0('Getting previousKinship')) +library(Rlabkey); +labkey.url.base = "http://localhost:8080/labkey/" +labkey.url.path = "/WNPRC/EHR" + +#labkey.setCurlOptions(ssl.verifypeer=FALSE, ssl.verifyhost=TRUE) +#previousRecords=NULL +previousKinship <- function(bool){ + print('fetching previousKinship using quarterOfCores') + previousRecords <- labkey.selectRows ( + baseUrl=labkey.url.base, + #to run directly in R, uncomment this line. otherwise providing a containerPath is not necessary + folderPath=labkey.url.path, + schemaName="ehr", + queryName="kinship", + colSelect=c('Id', 'Id2','coefficient'), + showHidden = TRUE, + colNameOpt = 'fieldname', #rname + #showHidden = FALSE + ) +} +numCores <- detectCores() +quarterOfCores <- numCores/2 +print(quarterOfCores) +#mclapply(TRUE,previousKinship, mc.cores = quarterOfCores) + # In order to reduce the max matrix size, calculate famids using makefamid, then analyze each group separately # It resizes the biggest matrix from 12000^2 to 8200^2 thus reduces the memory used by half -newRecords=NULL -for (species in unique(allPed$Species)){ + + +calculationKinship <- function(species){ + newRecords=NULL + filename=NULL + previousRecords=NULL + compareResults=NULL + summaryResults=NULL + mismatchResults=NULL + #columnKeys=NULL + + print(paste0('processing species: ', species)) - allRecordsForSpecies <- allPed[allPed$Species == species,] + recordsForSpecies <- allPed[allPed$Species == species,] + print(paste0('total subjects: ', nrow(recordsForSpecies))) + + recordsForSpecies <- addMissing(recordsForSpecies, species) + gc() + print(paste0('total subjects after adding missing: ', nrow(recordsForSpecies))) + + fami=makefamid(id=recordsForSpecies$Id,father.id=recordsForSpecies$Sire,mother.id=recordsForSpecies$Dam) + famid=unique(fami) + famid=famid[famid!=0] + gc() + + print(paste0('total families: ', length(famid))) + for (fam.no in famid){ + familytemp=recordsForSpecies[fami==fam.no,] + # print(paste0('family size: ',species, ' ' , nrow(familytemp))) + + temp.kin=kinship(familytemp$Id,familytemp$Sire,familytemp$Dam) + rownames(temp.kin) <- colnames(temp.kin) #add rownames to make matrix symmetric, which is required downstream + sparse.kin=as(temp.kin,"dsTMatrix") #change kinship matrix to symmetric triplet sparse matrix + + # convert to a sparse matrix usng S4 object from Matrix library + temp.tri=data.frame(Id=colnames(temp.kin)[sparse.kin@i+1],Id2=colnames(temp.kin)[sparse.kin@j+1],coefficient=sparse.kin@x,stringsAsFactors=FALSE) + newRecords=rbind(newRecords,temp.tri) + } + + filename <- paste0("kinship_",species,".txt"); + + firstRun = TRUE + print(paste("value of file.exists",file.exists(filename), filename)) + + + if (file.exists(filename)){ + print(paste0("opening previouskinship for ", species)) + previousRecords <- read.table(filename, header = TRUE, sep="\t", colClasses = c("character","character", "numeric")) + #colnames(previousRecords)0 && summaryResults$nrowSomeUnequal>0) + print("print comparison") + detailMismatches <- colsWithUnequalValues(COEFFICIENT,mismatchResults) + #print(summaryResults$colsWithUnequalValues) + mismatchFile <- paste0("mismatchesKinship",species,".txt") + saveReport(compareResults,reportName = mismatchFile, HTMLReport = FALSE) + write.table(detailMismatches, file = 'mismatches', append = FALSE,row.names=F,quote=F,sep="\t") + #write.table(mismatchResults, file = mismatchFile, append = FALSE, row.name=F,quote=F,sep="\t") + } + - # Add missing parents for accurate kinship calculations - fixedRecords <- with(allRecordsForSpecies, fixParents(id = Id, dadid = Sire, momid = Dam, sex = Gender)) +} +system.time({ + #lapply(unique(allPed$Species), calculationKinship) + mclapply(unique(allPed$Species), calculationKinship, mc.cores = quarterOfCores) +}) - # Kinship is expecting records to be sorted IAW it's own pedigree function - recordsForSpecies <- with(fixedRecords, pedigree(id=id,dadid=dadid,momid=momid,sex=sex,missid=0)) - temp.kin=kinship(recordsForSpecies) - # Add rownames to make matrix symmetric, which is required downstream - rownames(temp.kin) <- colnames(temp.kin) - # Convert kinship matrix to a triplet list of two ids and their coefficient - summaryDf = as.data.frame(summary(as(temp.kin, "dgCMatrix"))) - idList <- rownames(temp.kin) - temp.tri = data.frame(Id=idList[summaryDf$i], Id2=idList[summaryDf$j], coefficient=summaryDf$x) +#summary(previousRecords) +#See size of previousKinship to determine to add initial ran. + +#columnKeys <- c('Id','Id2') +#compareResults <- rCompare(previousRecords,newRecords,keys = columnKeys,roundDigits = NA,mismatches = NA,trimChars = FALSE); +#summaryResults <- summary(compareResults) +#print(summaryResults) +#print(summaryResults('nrowSomeUnequal')) +print(paste0('nrowCommon')) +#print(summaryResults$nrowCommon) +#print(summaryResults$nrowInAOnly) +#print(summaryResults$nrowInBOnly) +#print(summaryResults$datasetSummary) +#print(paste0('Number of rows with same results: ',nrowSomeUnequal(summaryResults))) +#print(paste0('Generate mismatches')) +#mismatchResults <- generateMismatchData(compareResults,previousKinship,newRecords) +#print(mismatchResults) - # Now filter out parents added for kinship calculation - temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id, perl = TRUE)) - temp.tri <- dplyr::filter(temp.tri, grepl("^(?!addin).*$", Id2, perl = TRUE)) - newRecords=rbind(newRecords,temp.tri) - print(paste0('total subjects: ', nrow(allRecordsForSpecies))) -} # write TSV to disk print("Output table:"); -print(str(newRecords)); -write.table(newRecords, file = "kinship.txt", append = FALSE,row.names=F,quote=F,sep="\t"); \ No newline at end of file +#print(str(newRecords)); +#print('value of mismatches ',is.null(mismatchResults)) +#if (is.null(mismatchResults)){ +#write.table(newRecords, file = "kinship.txt", append = FALSE,row.names=F,quote=F,sep="\t") +#} +#if(!is.null(mismatchResults)){ +#write.table('null', file = "kinship.txt", append = FALSE, row.name=F,quote=F,sep="\t") +#} \ No newline at end of file