From 94c55bb860b57ec52e224dd779cbdd69db517e0d Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Tue, 26 Aug 2025 14:07:06 -0700 Subject: [PATCH 1/3] Add rule that writes run time config This will be used for augur subsample, and is generally useful for debugging. --- phylogenetic/Snakefile | 1 + phylogenetic/rules/write_config.smk | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 phylogenetic/rules/write_config.smk diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index f9a67e9d..dba3dfe0 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -39,6 +39,7 @@ rule all: include: "rules/config.smk" +include: "rules/write_config.smk" include: "rules/prepare_sequences.smk" include: "rules/construct_phylogeny.smk" include: "rules/annotate_phylogeny.smk" diff --git a/phylogenetic/rules/write_config.smk b/phylogenetic/rules/write_config.smk new file mode 100644 index 00000000..7091a23e --- /dev/null +++ b/phylogenetic/rules/write_config.smk @@ -0,0 +1,19 @@ +""" +This part of the workflow writes run time configuration to a YAML file. + +OUTPUTS: + + results/{build_name}/run_config.yaml +""" + +rule write_config: + output: + config=build_dir + "/{build_name}/run_config.yaml", + log: + "logs/{build_name}/write_config.txt", + benchmark: + "benchmarks/{build_name}/write_config.txt", + run: + import yaml + with open(output.config, 'w') as f: + yaml.dump(config, f, sort_keys=False) From f83e274cc97de30d11861bfbfed16c2662ee914c Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Tue, 26 Aug 2025 14:08:23 -0700 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=9A=A7=20Use=20augur=20subsample?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The augur subsample command is built to replace the pattern of multiple intermediate augur filter calls with simpler Snakemake rules and configuration. Note that this is a breaking change and the old configuration will no longer work. Due to limitations of augur subsample, I had to move the "include" file from the top level of config to under a sample. For configs with multiple samples, I chose the sample somewhat arbitrarily. We could easily update augur subsample to accept the --include CLI option, but I'm not sure it's necessary given this easy workaround. --- CHANGELOG.md | 1 + phylogenetic/Snakefile | 2 +- phylogenetic/build-configs/ci/config.yaml | 103 ++++++++++--------- phylogenetic/build-configs/inrb/config.yaml | 4 +- phylogenetic/defaults/clade-i/config.yaml | 8 +- phylogenetic/defaults/hmpxv1/config.yaml | 103 ++++++++++--------- phylogenetic/defaults/hmpxv1_big/config.yaml | 35 ++++--- phylogenetic/defaults/mpxv/config.yaml | 95 +++++++++-------- phylogenetic/rules/config.smk | 11 -- phylogenetic/rules/prepare_sequences.smk | 47 ++------- 10 files changed, 199 insertions(+), 210 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52db0d1a..2b3d3200 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Instead, changes appear below grouped by the date they were added to the workflo ## 2025 +* TBD: [breaking] Switch to use `augur subsample`. * 02 July 2025: phylogenetic - config schema updates for easier config overlays ([#321][]) * new required config params * `exclude` - path to exclude.txt for `augur filter` diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index dba3dfe0..b666e5db 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -2,7 +2,7 @@ from packaging import version from augur.__version__ import __version__ as augur_version import sys -min_augur_version = "22.2.0" +min_augur_version = "31.5.0" if version.parse(augur_version) < version.parse(min_augur_version): print("This pipeline needs a newer version of augur than you currently have...") print( diff --git a/phylogenetic/build-configs/ci/config.yaml b/phylogenetic/build-configs/ci/config.yaml index e9637f7c..66ccb81a 100644 --- a/phylogenetic/build-configs/ci/config.yaml +++ b/phylogenetic/build-configs/ci/config.yaml @@ -4,7 +4,6 @@ custom_rules: reference: "defaults/reference.fasta" genome_annotation: "defaults/genome_annotation.gff3" genbank_reference: "defaults/reference.gb" -include: "defaults/hmpxv1/include.txt" exclude: "defaults/exclude.txt" clades: "defaults/clades.tsv" lat_longs: "defaults/lat_longs.tsv" @@ -30,52 +29,62 @@ filter: ### Set 1: Non-B.1 sequences: use all ### Set 2: B.1 sequences: small sample across year/country, maybe month subsample: - non_b1: >- - --group-by lineage year country - --sequences-per-group 50 - --exclude-where - outbreak!=hMPXV-1 - clade!=IIb - lineage=B.1 - lineage=B.1.1 - lineage=B.1.2 - lineage=B.1.3 - lineage=C.1 - lineage=C.1.1 - lineage=E.1 - lineage=E.2 - lineage=E.3 - lineage=B.1.4 - lineage=B.1.5 - lineage=B.1.6 - lineage=D.1 - lineage=B.1.7 - lineage=B.1.8 - lineage=B.1.9 - lineage=B.1.10 - lineage=B.1.11 - lineage=B.1.12 - lineage=B.1.13 - lineage=B.1.14 - lineage=B.1.15 - lineage=B.1.16 - lineage=B.1.17 - lineage=B.1.18 - lineage=B.1.19 - lineage=B.1.20 - lineage=F.1 - lineage=F.2 - lineage=F.3 - lineage=F.4 - lineage=F.5 - lineage=F.6 - lineage=B.1.21 - lineage=B.1.22 - lineage=B.1.23 - b1: >- - --group-by country year - --subsample-max-sequences 300 - --exclude-where outbreak!=hMPXV-1 clade!=IIb + samples: + non_b1: + group_by: + - lineage + - year + - country + sequences_per_group: 50 + exclude_where: + - outbreak!=hMPXV-1 + - clade!=IIb + - lineage=B.1 + - lineage=B.1.1 + - lineage=B.1.2 + - lineage=B.1.3 + - lineage=C.1 + - lineage=C.1.1 + - lineage=E.1 + - lineage=E.2 + - lineage=E.3 + - lineage=B.1.4 + - lineage=B.1.5 + - lineage=B.1.6 + - lineage=D.1 + - lineage=B.1.7 + - lineage=B.1.8 + - lineage=B.1.9 + - lineage=B.1.10 + - lineage=B.1.11 + - lineage=B.1.12 + - lineage=B.1.13 + - lineage=B.1.14 + - lineage=B.1.15 + - lineage=B.1.16 + - lineage=B.1.17 + - lineage=B.1.18 + - lineage=B.1.19 + - lineage=B.1.20 + - lineage=F.1 + - lineage=F.2 + - lineage=F.3 + - lineage=F.4 + - lineage=F.5 + - lineage=F.6 + - lineage=B.1.21 + - lineage=B.1.22 + - lineage=B.1.23 + include: + - defaults/hmpxv1/include.txt + b1: + group_by: + - country + - year + max_sequences: 300 + exclude_where: + - outbreak!=hMPXV-1 + - clade!=IIb ## align max_indel: 10000 diff --git a/phylogenetic/build-configs/inrb/config.yaml b/phylogenetic/build-configs/inrb/config.yaml index 3569c337..2a3abc44 100644 --- a/phylogenetic/build-configs/inrb/config.yaml +++ b/phylogenetic/build-configs/inrb/config.yaml @@ -21,5 +21,5 @@ traits: # Private INRB data doesn't have clade annotations so allow empty clade fields # (i.e. we're assuming all INRB data is clade I) subsample: - everything: >- - --query 'clade in ["I", "Ia", "Ib", ""]' + everything: + query: clade in ["I", "Ia", "Ib", ""] diff --git a/phylogenetic/defaults/clade-i/config.yaml b/phylogenetic/defaults/clade-i/config.yaml index 82b6170c..19564a08 100644 --- a/phylogenetic/defaults/clade-i/config.yaml +++ b/phylogenetic/defaults/clade-i/config.yaml @@ -1,7 +1,6 @@ reference: "defaults/clade-i/reference.fasta" genome_annotation: "defaults/clade-i/genome_annotation.gff3" genbank_reference: "defaults/clade-i/reference.gb" -include: "defaults/clade-i/include.txt" exclude: "defaults/exclude.txt" clades: "defaults/clades.tsv" lat_longs: "defaults/lat_longs.tsv" @@ -26,8 +25,11 @@ filter: ### Filter to only Clade I sequences subsample: - everything: >- - --query 'clade in ["I", "Ia", "Ib"]' + samples: + everything: + query: clade in ["I", "Ia", "Ib"] + include: + - defaults/clade-i/include.txt ## align max_indel: 10000 diff --git a/phylogenetic/defaults/hmpxv1/config.yaml b/phylogenetic/defaults/hmpxv1/config.yaml index 6f95b154..181491cc 100644 --- a/phylogenetic/defaults/hmpxv1/config.yaml +++ b/phylogenetic/defaults/hmpxv1/config.yaml @@ -1,7 +1,6 @@ reference: "defaults/reference.fasta" genome_annotation: "defaults/genome_annotation.gff3" genbank_reference: "defaults/reference.gb" -include: "defaults/hmpxv1/include.txt" exclude: "defaults/exclude.txt" clades: "defaults/clades.tsv" lat_longs: "defaults/lat_longs.tsv" @@ -27,52 +26,62 @@ filter: ### Set 1: Non-B.1 sequences: use all ### Set 2: B.1 sequences: small sample across year/country, maybe month subsample: - non_b1: >- - --group-by lineage year country - --sequences-per-group 50 - --exclude-where - outbreak!=hMPXV-1 - clade!=IIb - lineage=B.1 - lineage=B.1.1 - lineage=B.1.2 - lineage=B.1.3 - lineage=C.1 - lineage=C.1.1 - lineage=E.1 - lineage=E.2 - lineage=E.3 - lineage=B.1.4 - lineage=B.1.5 - lineage=B.1.6 - lineage=D.1 - lineage=B.1.7 - lineage=B.1.8 - lineage=B.1.9 - lineage=B.1.10 - lineage=B.1.11 - lineage=B.1.12 - lineage=B.1.13 - lineage=B.1.14 - lineage=B.1.15 - lineage=B.1.16 - lineage=B.1.17 - lineage=B.1.18 - lineage=B.1.19 - lineage=B.1.20 - lineage=F.1 - lineage=F.2 - lineage=F.3 - lineage=F.4 - lineage=F.5 - lineage=F.6 - lineage=B.1.21 - lineage=B.1.22 - lineage=B.1.23 - b1: >- - --group-by country year - --subsample-max-sequences 300 - --exclude-where outbreak!=hMPXV-1 clade!=IIb + samples: + non_b1: + group_by: + - lineage + - year + - country + sequences_per_group: 50 + exclude_where: + - outbreak!=hMPXV-1 + - clade!=IIb + - lineage=B.1 + - lineage=B.1.1 + - lineage=B.1.2 + - lineage=B.1.3 + - lineage=C.1 + - lineage=C.1.1 + - lineage=E.1 + - lineage=E.2 + - lineage=E.3 + - lineage=B.1.4 + - lineage=B.1.5 + - lineage=B.1.6 + - lineage=D.1 + - lineage=B.1.7 + - lineage=B.1.8 + - lineage=B.1.9 + - lineage=B.1.10 + - lineage=B.1.11 + - lineage=B.1.12 + - lineage=B.1.13 + - lineage=B.1.14 + - lineage=B.1.15 + - lineage=B.1.16 + - lineage=B.1.17 + - lineage=B.1.18 + - lineage=B.1.19 + - lineage=B.1.20 + - lineage=F.1 + - lineage=F.2 + - lineage=F.3 + - lineage=F.4 + - lineage=F.5 + - lineage=F.6 + - lineage=B.1.21 + - lineage=B.1.22 + - lineage=B.1.23 + include: + - defaults/hmpxv1/include.txt + b1: + group_by: + - country + - year + max_sequences: 300 + exclude_where: + - outbreak!=hMPXV-1 + - clade!=IIb ## align max_indel: 10000 diff --git a/phylogenetic/defaults/hmpxv1_big/config.yaml b/phylogenetic/defaults/hmpxv1_big/config.yaml index 0e8220dd..44862053 100644 --- a/phylogenetic/defaults/hmpxv1_big/config.yaml +++ b/phylogenetic/defaults/hmpxv1_big/config.yaml @@ -1,7 +1,6 @@ reference: "defaults/reference.fasta" genome_annotation: "defaults/genome_annotation.gff3" genbank_reference: "defaults/reference.gb" -include: "defaults/hmpxv1_big/include.txt" exclude: "defaults/exclude.txt" clades: "defaults/clades.tsv" lat_longs: "defaults/lat_longs.tsv" @@ -24,20 +23,26 @@ filter: query: "(QC_rare_mutations == 'good' | QC_rare_mutations == 'mediocre')" subsample: - b1: >- - --group-by year month country - --subsample-max-sequences 5000 - --exclude-where - outbreak!=hMPXV-1 - clade!=IIb - lineage=A - lineage=A.1 - lineage=A.1.1 - lineage=A.2 - lineage=A.2.1 - lineage=A.2.2 - lineage=A.2.3 - lineage=A.3 + samples: + b1: + group_by: + - year + - month + - country + max_sequences: 5000 + exclude_where: + - outbreak!=hMPXV-1 + - clade!=IIb + - lineage=A + - lineage=A.1 + - lineage=A.1.1 + - lineage=A.2 + - lineage=A.2.1 + - lineage=A.2.2 + - lineage=A.2.3 + - lineage=A.3 + include: + - defaults/hmpxv1_big/include.txt ## align max_indel: 10000 diff --git a/phylogenetic/defaults/mpxv/config.yaml b/phylogenetic/defaults/mpxv/config.yaml index 7f39b283..98b85512 100644 --- a/phylogenetic/defaults/mpxv/config.yaml +++ b/phylogenetic/defaults/mpxv/config.yaml @@ -1,5 +1,4 @@ auspice_config: "defaults/mpxv/auspice_config.json" -include: "defaults/mpxv/include.txt" exclude: "defaults/exclude.txt" reference: "defaults/reference.fasta" genome_annotation: "defaults/genome_annotation.gff3" @@ -26,49 +25,57 @@ filter: ### Set 1: Non-B.1 sequences: use all ### Set 2: B.1 sequences: small sample across year/country, maybe month subsample: - non_b1: >- - --group-by clade year country - --sequences-per-group 50 - --exclude-where - lineage=B.1 - lineage=B.1.1 - lineage=B.1.2 - lineage=B.1.3 - lineage=C.1 - lineage=C.1.1 - lineage=E.1 - lineage=E.2 - lineage=E.3 - lineage=B.1.4 - lineage=B.1.5 - lineage=B.1.6 - lineage=D.1 - lineage=B.1.7 - lineage=B.1.8 - lineage=B.1.9 - lineage=B.1.10 - lineage=B.1.11 - lineage=B.1.12 - lineage=B.1.13 - lineage=B.1.14 - lineage=B.1.15 - lineage=B.1.16 - lineage=B.1.17 - lineage=B.1.18 - lineage=B.1.19 - lineage=B.1.20 - lineage=F.1 - lineage=F.2 - lineage=F.3 - lineage=F.4 - lineage=F.5 - lineage=F.6 - lineage=B.1.21 - lineage=B.1.22 - lineage=B.1.23 - b1: >- - --group-by country year - --subsample-max-sequences 100 + samples: + non_b1: + group_by: + - clade + - year + - country + sequences_per_group: 50 + exclude_where: + - lineage=B.1 + - lineage=B.1.1 + - lineage=B.1.2 + - lineage=B.1.3 + - lineage=C.1 + - lineage=C.1.1 + - lineage=E.1 + - lineage=E.2 + - lineage=E.3 + - lineage=B.1.4 + - lineage=B.1.5 + - lineage=B.1.6 + - lineage=D.1 + - lineage=B.1.7 + - lineage=B.1.8 + - lineage=B.1.9 + - lineage=B.1.10 + - lineage=B.1.11 + - lineage=B.1.12 + - lineage=B.1.13 + - lineage=B.1.14 + - lineage=B.1.15 + - lineage=B.1.16 + - lineage=B.1.17 + - lineage=B.1.18 + - lineage=B.1.19 + - lineage=B.1.20 + - lineage=F.1 + - lineage=F.2 + - lineage=F.3 + - lineage=F.4 + - lineage=F.5 + - lineage=F.6 + - lineage=B.1.21 + - lineage=B.1.22 + - lineage=B.1.23 + include: + - defaults/mpxv/include.txt + b1: + group_by: + - country + - year + max_sequences: 100 ## align max_indel: 10000 diff --git a/phylogenetic/rules/config.smk b/phylogenetic/rules/config.smk index 0d94d6b4..980a4e65 100644 --- a/phylogenetic/rules/config.smk +++ b/phylogenetic/rules/config.smk @@ -44,17 +44,6 @@ if isinstance(config.get("colors", {}).get("ignore_categories"), str): # Looping over a shallow copy of the dict since we remove disabled subsampling groups for name, params in config["subsample"].copy().items(): - if isinstance(params, dict): - print(f"Converting config['subsample']['{name}'] from a dictionary to a string; " - "consider updating the config param in the config file.", file=sys.stderr) - config["subsample"][name] = " ".join(( - params["group_by"], - params["sequences_per_group"], - f"--query {params['query']}" if "query" in params else "", - f"--exclude-where {' '.join([f'lineage={l}' for l in params['exclude_lineages']])}" if "exclude_lineages" in params else "", - params.get("other_filters", ""), - )) - # Null subsample params represent disabled subsampling groups # This allows config overlays to disable the default subsampling groups with # YAML null value (~) which gets mapped to the Python `None` diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 9b7abc02..9c1439d0 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -138,40 +138,6 @@ rule add_private_data: rule subsample: input: - metadata=( - build_dir + "/{build_name}/good_metadata_combined.tsv" - if config.get("private_metadata", False) - else build_dir + "/{build_name}/good_metadata.tsv" - ), - output: - strains=build_dir + "/{build_name}/{sample}_strains.txt", - log=build_dir + "/{build_name}/{sample}_filter.log", - params: - augur_filter_args=lambda w: config["subsample"][w.sample], - strain_id=config["strain_id_field"], - log: - "logs/{build_name}/{sample}_subsample.txt", - benchmark: - "benchmarks/{build_name}/{sample}_subsample.txt" - shell: - r""" - exec &> >(tee {log:q}) - - augur filter \ - --metadata {input.metadata:q} \ - --metadata-id-columns {params.strain_id:q} \ - --output-strains {output.strains:q} \ - {params.augur_filter_args} \ - --output-log {output.log:q} - """ - - -rule combine_samples: - input: - strains=lambda w: [ - f"{build_dir}/{w.build_name}/{sample}_strains.txt" - for sample in config["subsample"] - ], sequences=( build_dir + "/{build_name}/good_sequences_combined.fasta" if config.get("private_sequences", False) @@ -182,26 +148,27 @@ rule combine_samples: if config.get("private_metadata", False) else build_dir + "/{build_name}/good_metadata.tsv" ), - include=config["include"], + config=build_dir + "/{build_name}/run_config.yaml", output: sequences=build_dir + "/{build_name}/filtered.fasta", metadata=build_dir + "/{build_name}/metadata.tsv", params: strain_id=config["strain_id_field"], + config_root="subsample", log: - "logs/{build_name}/combine_samples.txt", + "logs/{build_name}/subsample.txt", benchmark: - "benchmarks/{build_name}/combine_samples.txt" + "benchmarks/{build_name}/subsample.txt" shell: r""" exec &> >(tee {log:q}) - augur filter \ + augur subsample \ --metadata-id-columns {params.strain_id:q} \ --sequences {input.sequences:q} \ --metadata {input.metadata:q} \ - --exclude-all \ - --include {input.strains:q} {input.include:q}\ + --config {input.config:q} \ + --config-root {params.config_root:q} \ --output-sequences {output.sequences:q} \ --output-metadata {output.metadata:q} """ From bb1144174e777509b1284c2e2d8c3fc05116cd6a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Aug 2025 21:51:00 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- phylogenetic/rules/write_config.smk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/phylogenetic/rules/write_config.smk b/phylogenetic/rules/write_config.smk index 7091a23e..481c9d23 100644 --- a/phylogenetic/rules/write_config.smk +++ b/phylogenetic/rules/write_config.smk @@ -6,14 +6,16 @@ OUTPUTS: results/{build_name}/run_config.yaml """ + rule write_config: output: config=build_dir + "/{build_name}/run_config.yaml", log: "logs/{build_name}/write_config.txt", benchmark: - "benchmarks/{build_name}/write_config.txt", + "benchmarks/{build_name}/write_config.txt" run: import yaml - with open(output.config, 'w') as f: + + with open(output.config, "w") as f: yaml.dump(config, f, sort_keys=False)