From 9819c1d5e5e22633afb21e981bf711f272485663 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 18 Dec 2024 13:16:03 -0500 Subject: [PATCH 1/8] start new mafviz script --- setup.py | 3 +- src/cactus/maf/cactus_mafviz.py | 268 ++++++++++++++++++++++++++++++++ 2 files changed, 270 insertions(+), 1 deletion(-) create mode 100644 src/cactus/maf/cactus_mafviz.py diff --git a/setup.py b/setup.py index 877b43fa8..7c2a9aaed 100644 --- a/setup.py +++ b/setup.py @@ -65,4 +65,5 @@ def run(self): 'cactus-hal2maf = cactus.maf.cactus_hal2maf:main', 'cactus-hal2chains = cactus.maf.cactus_hal2chains:main', 'cactus-maf2bigmaf = cactus.maf.cactus_maf2bigmaf:main', - 'cactus-pangenome = cactus.refmap.cactus_pangenome:main']},) + 'cactus-pangenome = cactus.refmap.cactus_pangenome:main', + 'cactus-mafviz = cactus.maf.cactus_mafviz:main']},) diff --git a/src/cactus/maf/cactus_mafviz.py b/src/cactus/maf/cactus_mafviz.py new file mode 100644 index 000000000..c7e395c52 --- /dev/null +++ b/src/cactus/maf/cactus_mafviz.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 + +""" +This is a script to view a MAF (from cactus-hal2maf) using taffy +""" + +import os, sys +from argparse import ArgumentParser +import copy +import timeit, time +import math +import xml.etree.ElementTree as ET + +from operator import itemgetter + +from cactus.progressive.seqFile import SeqFile +from cactus.shared.common import setupBinaries, importSingularityImage +from cactus.shared.common import cactusRootPath +from cactus.shared.configWrapper import ConfigWrapper +from cactus.shared.common import makeURL, catFiles +from cactus.shared.common import enableDumpStack +from cactus.shared.common import cactus_override_toil_options +from cactus.shared.common import cactus_call +from cactus.shared.common import getOptionalAttrib, findRequiredNode +from cactus.shared.common import cactus_clamp_memory +from cactus.shared.version import cactus_commit +from toil.job import Job +from toil.common import Toil +from toil.statsAndLogging import logger +from toil.statsAndLogging import set_logging_from_options +from toil.realtimeLogger import RealtimeLogger +from cactus.shared.common import cactus_cpu_count +from cactus.maf.cactus_hal2maf import get_sed_rename_scripts +from toil.lib.humanize import bytes2human +from sonLib.bioio import getTempDirectory + +def main(): + parser = ArgumentParser() + Job.Runner.addToilOptions(parser) + + parser.add_argument("mafFile", help = "MAF file to convert to BigMaf (can be gzipped)") + parser.add_argument("outDir", help = "Output bigMaf file (.bb)") + + parser.add_argument("--refGenome", required=True, help = "Name of reference genome (1st row in MAF)") + parser.add_argument("--chroms", nargs='*', + help = "Subset to input chromosome(s). If used without arguments, split on all chromosomes") + + parser.add_argument("--genomes", nargs='+', + help = "Split into separate output for each given genome. Ex. --genomes $(halStats --genomes )") + + parser.add_argument("--haplotypes", nargs='+', + help = "Split into separate output for each given sample. Same as --genomes except " + " samples will be treaded together. ex hap.1,hap.2,hap.3 will be in the same output file " + "and renamed to hap_1,hap_2,hap_3") + + #Progressive Cactus Options + parser.add_argument("--configFile", dest="configFile", + help="Specify cactus configuration file", + default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml")) + parser.add_argument("--latest", dest="latest", action="store_true", + help="Use the latest version of the docker container " + "rather than pulling one matching this version of cactus") + parser.add_argument("--containerImage", dest="containerImage", default=None, + help="Use the the specified pre-built containter image " + "rather than pulling one from quay.io") + parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"], + help="The way to run the Cactus binaries", default=None) + + options = parser.parse_args() + + setupBinaries(options) + set_logging_from_options(options) + enableDumpStack() + + if not (options.mafFile.endswith('.maf') or options.mafFile.endswith('.maf.gz') or + options.mafFile.endswith('.taf') or options.mafFile.endswith('.taf.gz')): + raise RuntimeError('Input MAF file must have .maf/.maf.gz/.taf/.taf.gz extension') + + # Mess with some toil options to create useful defaults. + cactus_override_toil_options(options) + + logger.info('Cactus Command: {}'.format(' '.join(sys.argv))) + logger.info('Cactus Commit: {}'.format(cactus_commit)) + start_time = timeit.default_timer() + + with Toil(options) as toil: + importSingularityImage(options) + #Run the workflow + if options.restart: + viz_id_dict = toil.restart() + else: + #load cactus config + configNode = ET.parse(options.configFile).getroot() + config = ConfigWrapper(configNode) + config.substituteAllPredefinedConstantsWithLiterals(options) + + logger.info("Importing {}".format(options.mafFile)) + maf_id = toil.importFile(makeURL(options.mafFile)) + try: + logger.info("Importing {}".format(options.mafFile + '.tai')) + tai_id = toil.importFile(makeURL(options.mafFile + '.tai')) + except: + raise RuntimeError('Maf Index {} not found: Please create with taffy index'.format( + options.mafFile + '.tai')) + + viz_id_dict = toil.start(Job.wrapJobFn(mafviz_workflow, config, options, maf_id, tai_id)) + + #export the results + export_mafviz(toil, options, viz_id_dict) + + end_time = timeit.default_timer() + run_time = end_time - start_time + logger.info("cactus-mafviz has finished after {} seconds".format(run_time)) + + +def mafviz_workflow(job, config, options, maf_id, tai_id): + root_job = Job() + job.addChild(root_job) + # get the reference chromosomes + if options.chroms == []: + chrom_job = root_job.addFollowOnJobFn(tai_ref_croms, options, maf_id, tai_id) + root_job = chrom_job + options.chroms = chrom_job.rv() + + viz_all_job = root_job.addFollowOnJobFn(maf_viz_all, config, options, maf_id, tai_id) + + return viz_all_job.rv() + +def tai_ref_chroms(job, options, maf_id, tai_id): + """ use taffy to get the reference chromosomes from the tai """ + work_dir = job.fileStore.getLocalTempDir() + maf_path = os.path.join(work_dir, os.path.basename(options.mafFile)) + tai_path = maf_path + '.tai' + job.fileStore.readGlobalFile(maf_id, maf_path) + job.fileStore.readGlobalFile(tai_id, tai_path) + seq_stats = cactus_call(parameters=['taffy' 'stats', '-s', '-i', maf_path], check_output=True) + seq_list = [] + for line in seq_stats.split('\n'): + seq_list.append(line.split()[0]) + return seq_list + +def maf_viz_all(job, config, options, maf_id, tai_id): + """ make visualization, for each chrom / genome as specified, in parallel """ + out_dict = {} + + # "None" chromsome means whole-genome + if not options.chroms: + options.chroms = [None] + if options.chroms: + for chrom in options.chroms: + chrom_viz_job = job.addChildJobFn(maf_viz_chrom, config, options, maf_id, tai_id, chrom) + out_dict[chrom] = chrom_viz_job.rv() + + return out_dict + +def maf_viz_chrom(job, config, options, maf_id, tai_id, chrom): + """ make visualizations for given chromosome """ + work_dir = job.fileStore.getLocalTempDir() + maf_path = os.path.join(work_dir, os.path.basename(options.mafFile)) + tai_path = maf_path + '.tai' + job.fileStore.readGlobalFile(maf_id, maf_path) + job.fileStore.readGlobalFile(tai_id, tai_path) + + # extract the chromosome if specified + if chrom: + chrom_maf_path = os.path.join(work_dir, chrom + '.' + os.path.basename(options.mafFile)) + maf_view_command = ['taffy', 'view', '-m', '-i', maf_path, '-r', options.refGenome + '.' + chrom] + if maf_path.endswith('.gz'): + maf_view_command += ['-c'] + cactus_call(parameters=maf_view_command, outfile=chrom_maf_path) + maf_path = chrom_maf_path + maf_id = job.fileStore.writeGlobalFile(maf_path) + + if not options.genomes and not options.haplotypes: + options.genomes = [None] + + genome_groups = [] + if options.genomes: + for genome in options.genomes: + genome_groups.append([g]) + + # group haplotypes by sample, using .1 .2 etc naming convention from mc + if options.haplotypes: + sample_map = {} + for h in options.haplotypes: + assert '.' in h + sample = h[0:h.rfind('.')] + if sample in sample_map: + sample_map[sample].append(h) + else: + sample_map[sample] = [h] + for hap_list in sample_map.values(): + genome_groups.append(hap_list) + + + genome_dict = {} + for genome_group in genome_groups: + viz_job = job.addChildJobFn(maf_viz, config, options, maf_id, chrom, genome_group) + genome_dict[genome_group[0]] = viz_job.rv() + + return genome_dict + +def maf_viz(job, config, options, maf_id, chrom, genomes): + """ make the visualizations for the given maf and genomes """ + + work_dir = job.fileStore.getLocalTempDir() + prefix = chrom + '.' if chrom else '' + maf_path = os.path.join(work_dir, prefix + os.path.basename(options.mafFile)) + job.fileStore.readGlobalFile(maf_id, maf_path) + + if genomes != [None]: + extract_cmd = [['gzip', '-dc', maf_path]] if options.mafFile.endswith('.gz') else [['cat', maf_path]] + for genome in genomes: + if '.' in genome: + new_genome = '_'.join(genome.rsplit('.', 1)) + extract_cmd.append(['sed', '-e', 's/{}/{}/g'.format(genome, new_genome)]) + extract_cmd.append(['mafFilter', '-m', '-', '-l', '2', '-i', ','.join(genomes + [options.refGenome])]) + extract_cmd.append(['bgzip']) + + extract_maf_path = os.path.join(work_dir, genomes[0] + '.' + os.path.basename(options.mafFile)) + if not extract_maf_path.endswith('.gz'): + extract_maf_path += '.gz' + cactus_call(parameters=extract_cmd, outfile=extract_maf_path) + else: + extract_maf_path = maf_path + + # now we can do this viz + dotfile = os.path.join(work_dir, 'dotplot.pdf') + cactus_call(parameters=['python3', '/home/hickey/dev/taffy/scripts/taffy_alignment_plot.py', + maf_path, '--show_dot_plot', '--output_format', 'png', + '--out_file', dotfile]) + + syntfile = os.path.join(work_dir, 'synteny.pdf') + cactus_call(parameters=['python3', '/home/hickey/dev/taffy/scripts/taffy_alignment_plot.py', + maf_path, '--show_synteny', '--output_format', 'png', + '--bin_number', '500', '--show_sequence_boundaries', + '--out_file', syntfile]) + + return { 'dot' : job.fileStore.writeGlobalFile(dotfile), + 'synteny' : job.fileStore.writeGlobalFile(syntfile) } + +def export_mafviz(toil, options, output_dict): + """ export the files into a directory """ + + name = os.path.splitext(os.path.basename(options.mafFile))[0] + if options.mafFile.endswith('.gz'): + name = os.path.splitext(name)[0] + + for chrom, chrom_dict in output_dict.items(): + chrom_base = os.path.join(options.outDir, chrom) if chrom else options.outDir + if not chrom_base.startswith('s3://') and not os.path.isdir(chrom_base): + os.makedirs(chrom_base) + + for genome, genome_dict in chrom_dict.items(): + genome_base = os.path.join(chrom_base, genome) if genome else chrom_base + if not genome_base.startswith('s3://') and not os.path.isdir(genome_base): + os.makedirs(genome_base) + + for viz_type, viz_id in genome_dict.items(): + file_name = name + if chrom: + file_name += '.{}'.format(chrom) + if genome: + file_name += '.{}'.format(genome) + file_name += '.{}.png'.format(viz_type) + + toil.exportFile(viz_id, makeURL(os.path.join(genome_base, file_name))) + From b2ae7a553553444c2d0b605e8c9b87922c53f054 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 18 Dec 2024 14:04:39 -0500 Subject: [PATCH 2/8] split maffilter --- src/cactus/maf/cactus_mafviz.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/cactus/maf/cactus_mafviz.py b/src/cactus/maf/cactus_mafviz.py index c7e395c52..073db8a63 100644 --- a/src/cactus/maf/cactus_mafviz.py +++ b/src/cactus/maf/cactus_mafviz.py @@ -208,13 +208,18 @@ def maf_viz(job, config, options, maf_id, chrom, genomes): maf_path = os.path.join(work_dir, prefix + os.path.basename(options.mafFile)) job.fileStore.readGlobalFile(maf_id, maf_path) - if genomes != [None]: + if genomes != [None]: + sed_cmd = ['sed'] extract_cmd = [['gzip', '-dc', maf_path]] if options.mafFile.endswith('.gz') else [['cat', maf_path]] for genome in genomes: if '.' in genome: new_genome = '_'.join(genome.rsplit('.', 1)) - extract_cmd.append(['sed', '-e', 's/{}/{}/g'.format(genome, new_genome)]) - extract_cmd.append(['mafFilter', '-m', '-', '-l', '2', '-i', ','.join(genomes + [options.refGenome])]) + sed_cmd += ['-e', 's/{}/{}/g'.format(genome, new_genome)] + if len(sed_cmd) > 1: + extract_cmd.append(sed_cmd) + extract_cmd.append(['mafFilter', '-m', '-', '-i', ','.join(genomes + [options.refGenome])]) + extract_cmd.append(['mafFilter', '-m', '-', '-l', '2']) + extract_cmd.append(['bgzip']) extract_maf_path = os.path.join(work_dir, genomes[0] + '.' + os.path.basename(options.mafFile)) From dc8612cb9d8bf87ff58e82bd41f99313f7622ad5 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 18 Dec 2024 14:54:48 -0500 Subject: [PATCH 3/8] fix bugs --- src/cactus/maf/cactus_mafviz.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/cactus/maf/cactus_mafviz.py b/src/cactus/maf/cactus_mafviz.py index 073db8a63..d821e6306 100644 --- a/src/cactus/maf/cactus_mafviz.py +++ b/src/cactus/maf/cactus_mafviz.py @@ -211,13 +211,15 @@ def maf_viz(job, config, options, maf_id, chrom, genomes): if genomes != [None]: sed_cmd = ['sed'] extract_cmd = [['gzip', '-dc', maf_path]] if options.mafFile.endswith('.gz') else [['cat', maf_path]] + new_genomes = [] for genome in genomes: if '.' in genome: new_genome = '_'.join(genome.rsplit('.', 1)) sed_cmd += ['-e', 's/{}/{}/g'.format(genome, new_genome)] + new_genomes.append(new_genome) if len(sed_cmd) > 1: extract_cmd.append(sed_cmd) - extract_cmd.append(['mafFilter', '-m', '-', '-i', ','.join(genomes + [options.refGenome])]) + extract_cmd.append(['mafFilter', '-m', '-', '-i', ','.join(new_genomes + [options.refGenome])]) extract_cmd.append(['mafFilter', '-m', '-', '-l', '2']) extract_cmd.append(['bgzip']) @@ -232,12 +234,12 @@ def maf_viz(job, config, options, maf_id, chrom, genomes): # now we can do this viz dotfile = os.path.join(work_dir, 'dotplot.pdf') cactus_call(parameters=['python3', '/home/hickey/dev/taffy/scripts/taffy_alignment_plot.py', - maf_path, '--show_dot_plot', '--output_format', 'png', + extract_maf_path, '--show_dot_plot', '--output_format', 'png', '--out_file', dotfile]) syntfile = os.path.join(work_dir, 'synteny.pdf') cactus_call(parameters=['python3', '/home/hickey/dev/taffy/scripts/taffy_alignment_plot.py', - maf_path, '--show_synteny', '--output_format', 'png', + extract_maf_path, '--show_synteny', '--output_format', 'png', '--bin_number', '500', '--show_sequence_boundaries', '--out_file', syntfile]) From 48ee8fddceb05b99050980b4306a9b8ff3496204 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 18 Dec 2024 15:17:46 -0500 Subject: [PATCH 4/8] shallower outdir --- src/cactus/maf/cactus_mafviz.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/cactus/maf/cactus_mafviz.py b/src/cactus/maf/cactus_mafviz.py index d821e6306..5f561519c 100644 --- a/src/cactus/maf/cactus_mafviz.py +++ b/src/cactus/maf/cactus_mafviz.py @@ -259,10 +259,6 @@ def export_mafviz(toil, options, output_dict): os.makedirs(chrom_base) for genome, genome_dict in chrom_dict.items(): - genome_base = os.path.join(chrom_base, genome) if genome else chrom_base - if not genome_base.startswith('s3://') and not os.path.isdir(genome_base): - os.makedirs(genome_base) - for viz_type, viz_id in genome_dict.items(): file_name = name if chrom: @@ -271,5 +267,5 @@ def export_mafviz(toil, options, output_dict): file_name += '.{}'.format(genome) file_name += '.{}.png'.format(viz_type) - toil.exportFile(viz_id, makeURL(os.path.join(genome_base, file_name))) + toil.exportFile(viz_id, makeURL(os.path.join(chrom_base, file_name))) From 80f63f100228be362d1a8509c7ed0ab8c08d02c7 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 18 Dec 2024 15:31:50 -0500 Subject: [PATCH 5/8] bug --- src/cactus/maf/cactus_mafviz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cactus/maf/cactus_mafviz.py b/src/cactus/maf/cactus_mafviz.py index 5f561519c..c7594062b 100644 --- a/src/cactus/maf/cactus_mafviz.py +++ b/src/cactus/maf/cactus_mafviz.py @@ -177,7 +177,7 @@ def maf_viz_chrom(job, config, options, maf_id, tai_id, chrom): genome_groups = [] if options.genomes: for genome in options.genomes: - genome_groups.append([g]) + genome_groups.append([genome]) # group haplotypes by sample, using .1 .2 etc naming convention from mc if options.haplotypes: From 7ac36c30ed72c593b53219216ca9bfb28a97c40b Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 18 Dec 2024 15:39:11 -0500 Subject: [PATCH 6/8] fix stuff --- src/cactus/maf/cactus_mafviz.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/cactus/maf/cactus_mafviz.py b/src/cactus/maf/cactus_mafviz.py index c7594062b..9d9d28b52 100644 --- a/src/cactus/maf/cactus_mafviz.py +++ b/src/cactus/maf/cactus_mafviz.py @@ -148,7 +148,8 @@ def maf_viz_all(job, config, options, maf_id, tai_id): options.chroms = [None] if options.chroms: for chrom in options.chroms: - chrom_viz_job = job.addChildJobFn(maf_viz_chrom, config, options, maf_id, tai_id, chrom) + chrom_viz_job = job.addChildJobFn(maf_viz_chrom, config, options, maf_id, tai_id, chrom, + disk=maf_id.size*2) out_dict[chrom] = chrom_viz_job.rv() return out_dict @@ -192,12 +193,15 @@ def maf_viz_chrom(job, config, options, maf_id, tai_id, chrom): for hap_list in sample_map.values(): genome_groups.append(hap_list) + print(genome_groups) genome_dict = {} for genome_group in genome_groups: - viz_job = job.addChildJobFn(maf_viz, config, options, maf_id, chrom, genome_group) + viz_job = job.addChildJobFn(maf_viz, config, options, maf_id, chrom, genome_group, + disk=maf_id.size*3) genome_dict[genome_group[0]] = viz_job.rv() + print(genome_dict) return genome_dict def maf_viz(job, config, options, maf_id, chrom, genomes): @@ -216,7 +220,9 @@ def maf_viz(job, config, options, maf_id, chrom, genomes): if '.' in genome: new_genome = '_'.join(genome.rsplit('.', 1)) sed_cmd += ['-e', 's/{}/{}/g'.format(genome, new_genome)] - new_genomes.append(new_genome) + else: + new_genome = genome + new_genomes.append(new_genome) if len(sed_cmd) > 1: extract_cmd.append(sed_cmd) extract_cmd.append(['mafFilter', '-m', '-', '-i', ','.join(new_genomes + [options.refGenome])]) From 8d4568916607221175e9d00c6ee8d052871d58d3 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 18 Dec 2024 16:13:19 -0500 Subject: [PATCH 7/8] fix stuff --- src/cactus/maf/cactus_mafviz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cactus/maf/cactus_mafviz.py b/src/cactus/maf/cactus_mafviz.py index 9d9d28b52..e29e7a869 100644 --- a/src/cactus/maf/cactus_mafviz.py +++ b/src/cactus/maf/cactus_mafviz.py @@ -230,7 +230,7 @@ def maf_viz(job, config, options, maf_id, chrom, genomes): extract_cmd.append(['bgzip']) - extract_maf_path = os.path.join(work_dir, genomes[0] + '.' + os.path.basename(options.mafFile)) + extract_maf_path = os.path.join(work_dir, prefix + genomes[0] + '.' + os.path.basename(options.mafFile)) if not extract_maf_path.endswith('.gz'): extract_maf_path += '.gz' cactus_call(parameters=extract_cmd, outfile=extract_maf_path) From bcb2023554d8fb4d10d093d9a9fff99e28081527 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 20 Dec 2024 13:50:54 -0500 Subject: [PATCH 8/8] stable name --- src/cactus/maf/cactus_mafviz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cactus/maf/cactus_mafviz.py b/src/cactus/maf/cactus_mafviz.py index e29e7a869..f5df51812 100644 --- a/src/cactus/maf/cactus_mafviz.py +++ b/src/cactus/maf/cactus_mafviz.py @@ -199,7 +199,7 @@ def maf_viz_chrom(job, config, options, maf_id, tai_id, chrom): for genome_group in genome_groups: viz_job = job.addChildJobFn(maf_viz, config, options, maf_id, chrom, genome_group, disk=maf_id.size*3) - genome_dict[genome_group[0]] = viz_job.rv() + genome_dict[sorted(genome_group)[0]] = viz_job.rv() print(genome_dict) return genome_dict