From d7f401af5fdfbe08bc029e18d7724b45e53158a0 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 14 May 2024 14:27:11 +0100 Subject: [PATCH 01/10] add encode eclip mode --- main.nf | 9 ++++- .../goodwright/clipseq/encode_moveumi/main.nf | 24 +++++++++++ .../clipseq/encode_moveumi/meta.yml | 40 +++++++++++++++++++ .../templates/encode_moveumi.py | 29 ++++++++++++++ nextflow.config | 1 + schema/clipseq.json | 5 +++ 6 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 modules/goodwright/clipseq/encode_moveumi/main.nf create mode 100644 modules/goodwright/clipseq/encode_moveumi/meta.yml create mode 100644 modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py diff --git a/main.nf b/main.nf index 7d32840..b457fa4 100644 --- a/main.nf +++ b/main.nf @@ -116,6 +116,7 @@ include { CLIPPY as CLIPPY_TRANSCRIPT } from './modules/goodwrigh include { PEKA } from './modules/goodwright/peka/main' include { DUMP_SOFTWARE_VERSIONS } from './modules/goodwright/dump_software_versions/main' include { CLIPSEQ_CLIPQC } from './modules/goodwright/clipseq/clipqc/main' +include { ENCODE_MOVEUMI } from './modules/goodwright/clipseq/encode_moveumi/main' // // SUBWORKFLOWS @@ -277,7 +278,13 @@ workflow CLIPSEQ { } //EXAMPLE CHANNEL STRUCT: [[id:h3k27me3_R1, group:h3k27me3, replicate:1, single_end:false], [FASTQ]] //ch_fastq | view - + if(params.encode_eclip){ + ENCODE_MOVEUMI ( + ch_fastq + ) + ch_versions = ch_versions.mix(ENCODE_MOVEUMI.out.versions) + ch_fastq = ENCODE_MOVEUMI.out.reads + } if(params.run_move_umi_to_header){ UMITOOLS_EXTRACT ( ch_fastq diff --git a/modules/goodwright/clipseq/encode_moveumi/main.nf b/modules/goodwright/clipseq/encode_moveumi/main.nf new file mode 100644 index 0000000..81154a9 --- /dev/null +++ b/modules/goodwright/clipseq/encode_moveumi/main.nf @@ -0,0 +1,24 @@ +process ENCODE_MOVEUMI { + label "process_single" + + conda "bioconda::biopython=1.70" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/biopython:1.81 ' : + 'biocontainers/biopython:1.81' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + process_name = task.process + template 'encode_moveumi.py' +} diff --git a/modules/goodwright/clipseq/encode_moveumi/meta.yml b/modules/goodwright/clipseq/encode_moveumi/meta.yml new file mode 100644 index 0000000..8429c6e --- /dev/null +++ b/modules/goodwright/clipseq/encode_moveumi/meta.yml @@ -0,0 +1,40 @@ +name: clipseq_clipqc +description: Runs python-based clip quality control and outputs to a set of tsv files +keywords: + - iCLIP + - eCLIP + - CLIP + - qc +input: + - premap: + type: file + description: Premap metrics files + - mapped: + type: file + description: Alignment metrics files + - collapse: + type: file + description: UMI collapse metrics files + - xlinks: + type: file + description: xlinks metric files + - icount: + type: file + description: iCount metrics files + - paraclu: + type: file + description: paraclu metrics files + - clippy: + type: file + description: Clippy metrics files +output: + - tsv: + type: file + description: All tsv file outputs + pattern: "*.tsv" + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" +authors: + - "@chris-cheshire" diff --git a/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py new file mode 100644 index 0000000..7794903 --- /dev/null +++ b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +import os +import sys +import gzip +import platform +from Bio import SeqIO +import Bio + +input_fq = "!{reads}" +output_fq = "!{prefix}.umi.fastq" + +with gzip.open(input_fq, mode = 'rt') as f_in: + with open(output_fq, mode = 'w') as f_out: + for record in SeqIO.parse(f_in, 'fastq'): + header = record.id.split(":") + if '_' not in header[-1]: + rearranged = ":".join(header[1:]) + '_' + header[0] + record.id = rearranged + record.name = rearranged + record.description = rearranged + SeqIO.write(record, f_out, 'fastq') + +os.system('gzip ' + output_fq) + +with open("versions.yml", "w") as out_f: + out_f.write("!{process_name}" + ":\n") + out_f.write(" python: " + platform.python_version() + "\n") + out_f.write(" biopython: " + Bio.__version__ + "\n") \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 6bbe635..8c471a2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -69,6 +69,7 @@ params { save_unaligned_res = true // Pipeline params + encode_eclip = false move_umi_to_header = false umi_header_format = null save_unaligned = true // DO NOT CHANGE diff --git a/schema/clipseq.json b/schema/clipseq.json index 2ca3536..2ca109d 100644 --- a/schema/clipseq.json +++ b/schema/clipseq.json @@ -270,6 +270,11 @@ "description": "Additional pipeline configuration options.", "advanced": true, "properties": { + "encode_eclip": { + "name": "ENCODE eCLIP mode", + "description": "Use just read 2 and enable this option which moves the header UMI to correct location for downstream processing.", + "type": "boolean" + }, "move_umi_to_header": { "name": "Extract UMI to header", "description": "Runs UMI to header extraction based on the head format provided in UMI header format.", From 986b723a186a013032032d1f469c49afc3bcde9a Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 14 May 2024 15:44:58 +0100 Subject: [PATCH 02/10] update encode_moveumi --- .../clipseq/encode_moveumi/templates/encode_moveumi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py index 7794903..ef58777 100644 --- a/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py +++ b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py @@ -15,7 +15,7 @@ for record in SeqIO.parse(f_in, 'fastq'): header = record.id.split(":") if '_' not in header[-1]: - rearranged = ":".join(header[1:]) + '_' + header[0] + rearranged = ":".join(header[1:]) + '_rbc:' + header[0] record.id = rearranged record.name = rearranged record.description = rearranged From 47a9c391d347106f1687872d9e53f99b610ad334 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 14 May 2024 15:53:54 +0100 Subject: [PATCH 03/10] update zipping to pigz for speed --- modules/goodwright/clipseq/encode_moveumi/main.nf | 6 ++---- .../clipseq/encode_moveumi/templates/encode_moveumi.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/modules/goodwright/clipseq/encode_moveumi/main.nf b/modules/goodwright/clipseq/encode_moveumi/main.nf index 81154a9..2e2069d 100644 --- a/modules/goodwright/clipseq/encode_moveumi/main.nf +++ b/modules/goodwright/clipseq/encode_moveumi/main.nf @@ -1,10 +1,8 @@ process ENCODE_MOVEUMI { label "process_single" - conda "bioconda::biopython=1.70" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/biopython:1.81 ' : - 'biocontainers/biopython:1.81' }" + conda "bioconda::biopython=1.78 pigz=2.6" + container "quay.io/biocontainers/mulled-v2-877c4e5a8fad685ea5bde487e04924ac447923b9:b7daa641364165419b9a87d9988bc803f913c5b6-0" input: tuple val(meta), path(reads) diff --git a/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py index ef58777..4b4ff31 100644 --- a/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py +++ b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py @@ -21,7 +21,7 @@ record.description = rearranged SeqIO.write(record, f_out, 'fastq') -os.system('gzip ' + output_fq) +os.system('pigz ' + output_fq) with open("versions.yml", "w") as out_f: out_f.write("!{process_name}" + ":\n") From 9efb325ba33bdd96a3f1a7bbdd1902bc268c6892 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 7 Jan 2025 17:21:50 +0000 Subject: [PATCH 04/10] correct "skip umi dedupe" path previously many processes were skipped when skip_umi_dedupe param = true. this was because ch_genome_bam and ch_genome_bai were not set. this is now fixed by adding an else clause to the logic in main.nf --- main.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/main.nf b/main.nf index b457fa4..687b2c2 100644 --- a/main.nf +++ b/main.nf @@ -440,6 +440,9 @@ workflow CLIPSEQ { ch_versions = ch_versions.mix(TRANSCRIPT_DEDUP.out.versions) ch_transcript_bam = TRANSCRIPT_DEDUP.out.bam ch_transcript_bai = TRANSCRIPT_DEDUP.out.bai + } else { + ch_genome_bam = ch_genome_unique_bam + ch_genome_bai = ch_genome_unique_bai } ch_genome_crosslink_bed = Channel.empty() From 35f55e873a62cf66773240319edc1211cc8f01db Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 7 Jan 2025 19:13:28 +0000 Subject: [PATCH 05/10] update get crosslinks convert from subworkflow to more manageable module enable ability to choose crosslink position as start middle or end of read --- conf/modules.config | 123 +----------------- main.nf | 22 ++-- modules/local/get_crosslinks.nf | 78 +++++++++++ nextflow.config | 1 + schema/clipseq.json | 16 ++- .../clip_calc_crosslinks/default.config | 49 ------- .../goodwright/clip_calc_crosslinks/main.nf | 120 ----------------- .../goodwright/clip_calc_crosslinks/meta.yml | 53 -------- 8 files changed, 106 insertions(+), 356 deletions(-) create mode 100644 modules/local/get_crosslinks.nf delete mode 100644 subworkflows/goodwright/clip_calc_crosslinks/default.config delete mode 100644 subworkflows/goodwright/clip_calc_crosslinks/main.nf delete mode 100644 subworkflows/goodwright/clip_calc_crosslinks/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 9301b85..62e5729 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -568,55 +568,8 @@ if(params.run_umi_dedup) { if(params.run_calc_crosslinks) { process { - withName: '.*CROSSLINKS:BEDTOOLS_BAMTOBED' { - publishDir = [ - enabled: false - ] - } - - withName: '.*CROSSLINKS:BEDTOOLS_SHIFT' { - ext.args = '-m 1 -p -1' - publishDir = [ - enabled: false - ] - } - - withName: '.*CROSSLINKS:BEDTOOLS_GENOMECOV_POS' { - ext.args = '-dz -strand + -5' - publishDir = [ - enabled: false - ] - } - - withName: '.*CROSSLINKS:BEDTOOLS_GENOMECOV_NEG' { - ext.args = '-dz -strand - -5' - publishDir = [ - enabled: false - ] - } - - withName: '.*CROSSLINKS:SELECT_BED_POS' { - ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}\'' - ext.suffix = '.pos' - ext.ext = 'bed' - publishDir = [ - enabled: false - ] - } - - withName: '.*CROSSLINKS:SELECT_BED_NEG' { - ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}\'' - ext.suffix = '.neg' - ext.ext = 'bed' - publishDir = [ - enabled: false - ] - } - - withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS:MERGE_AND_SORT' { - ext.cmd1 = 'sort -k1,1 -k2,2n' + withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS' { ext.suffix = '.genome' - ext.ext = 'bed' publishDir = [ path: { "${params.outdir}/04_crosslinks" }, mode: "${params.publish_dir_mode}", @@ -624,66 +577,8 @@ if(params.run_calc_crosslinks) { ] } - withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS:CROSSLINK_COVERAGE' { - ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' - ext.suffix = '.genome' - ext.ext = 'bedgraph' - publishDir = [ - path: { "${params.outdir}/04_crosslinks" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS:CROSSLINK_NORMCOVERAGE' { - ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' - ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'' - ext.suffix = '.norm.genome' - ext.ext = 'bedgraph' - publishDir = [ - path: { "${params.outdir}/04_crosslinks" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS:MERGE_AND_SORT' { - ext.cmd1 = 'sort -k1,1 -k2,2n' + withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS' { ext.suffix = '.transcript' - ext.ext = 'bed' - publishDir = [ - path: { "${params.outdir}/04_crosslinks" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS:CROSSLINK_COVERAGE' { - ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' - ext.suffix = '.transcript' - ext.ext = 'bedgraph' - publishDir = [ - path: { "${params.outdir}/04_crosslinks" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS:CROSSLINK_NORMCOVERAGE' { - ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' - ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'' - ext.suffix = '.norm.transcript' - ext.ext = 'bedgraph' - publishDir = [ - path: { "${params.outdir}/04_crosslinks" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:MERGE_AND_SORT' { - ext.cmd1 = 'sort -k1,1 -k2,2n' - ext.suffix = '.smrna_withk1' - ext.ext = 'bed' publishDir = [ path: { "${params.outdir}/04_crosslinks" }, mode: "${params.publish_dir_mode}", @@ -691,26 +586,14 @@ if(params.run_calc_crosslinks) { ] } - withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_COVERAGE' { - ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' + withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS' { ext.suffix = '.smrna_withk1' - ext.ext = 'bedgraph' publishDir = [ path: { "${params.outdir}/04_crosslinks" }, mode: "${params.publish_dir_mode}", saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - - withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_NORMCOVERAGE' { - ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' - ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'' - ext.suffix = '.norm.smrna_withk1' - ext.ext = 'bedgraph' - publishDir = [ - enabled: false - ] - } } } diff --git a/main.nf b/main.nf index 687b2c2..bd0d336 100644 --- a/main.nf +++ b/main.nf @@ -95,6 +95,10 @@ ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: // include { MULTIQC } from './modules/local/multiqc' +include { GET_CROSSLINKS as CALC_SMRNA_K1_CROSSLINKS } from './modules/local/get_crosslinks' +include { GET_CROSSLINKS as CALC_GENOME_CROSSLINKS } from './modules/local/get_crosslinks' +include { GET_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS } from './modules/local/get_crosslinks' + // // SUBWORKFLOWS @@ -131,9 +135,6 @@ include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_MULTI_DEDUP } from './subwor include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_K1_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' include { BAM_DEDUP_SAMTOOLS_UMITOOLS as TRANSCRIPT_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' -include { CLIP_CALC_CROSSLINKS as CALC_SMRNA_K1_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' -include { CLIP_CALC_CROSSLINKS as CALC_GENOME_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' -include { CLIP_CALC_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' include { PARACLU_ANALYSE as PARACLU_ANALYSE_GENOME } from './subworkflows/goodwright/paraclu_analyse/main' include { PARACLU_ANALYSE as PARACLU_ANALYSE_TRANSCRIPT } from './subworkflows/goodwright/paraclu_analyse/main' include { ICOUNT_ANALYSE } from './subworkflows/goodwright/icount_analyse/main' @@ -456,8 +457,9 @@ workflow CLIPSEQ { * SUBWORKFLOW: Run crosslink calculation for smRNA with -k 1 */ CALC_SMRNA_K1_CROSSLINKS ( - ch_smrna_k1_bam, - ch_smrna_fasta_fai.collect{ it[1] } + ch_smrna_k1_bam.join(ch_smrna_k1_bai), + ch_smrna_fasta_fai, + params.crosslink_position ) ch_versions = ch_versions.mix(CALC_SMRNA_K1_CROSSLINKS.out.versions) ch_smrna_crosslink_bed = CALC_SMRNA_K1_CROSSLINKS.out.bed @@ -468,8 +470,9 @@ workflow CLIPSEQ { * SUBWORKFLOW: Run crosslink calculation for genome */ CALC_GENOME_CROSSLINKS ( - ch_genome_bam, - ch_fasta_fai.collect{ it[1] } + ch_genome_bam.join(ch_genome_bai), + ch_fasta_fai, + params.crosslink_position ) ch_versions = ch_versions.mix(CALC_GENOME_CROSSLINKS.out.versions) ch_genome_crosslink_bed = CALC_GENOME_CROSSLINKS.out.bed @@ -480,8 +483,9 @@ workflow CLIPSEQ { * SUBWORKFLOW: Run crosslink calculation for transcripts */ CALC_TRANSCRIPT_CROSSLINKS ( - ch_transcript_bam, - ch_longest_transcript_fai.collect{ it[1] } + ch_transcript_bam.join(ch_transcript_bai), + ch_longest_transcript_fai, + params.crosslink_position ) ch_versions = ch_versions.mix(CALC_TRANSCRIPT_CROSSLINKS.out.versions) ch_trans_crosslink_bed = CALC_TRANSCRIPT_CROSSLINKS.out.bed diff --git a/modules/local/get_crosslinks.nf b/modules/local/get_crosslinks.nf new file mode 100644 index 0000000..02ce4ad --- /dev/null +++ b/modules/local/get_crosslinks.nf @@ -0,0 +1,78 @@ +process GET_CROSSLINKS { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bedtools=2.30.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.30.0--hc088bd4_0' : + 'biocontainers/bedtools:2.30.0--hc088bd4_0' }" + + input: + tuple val(meta), path(bam), path(bai) + tuple val(meta2), path(fai) + val crosslink_position + + output: + tuple val(meta), path("*.xl.bed") , emit: bed + tuple val(meta), path("*.xl.bedgraph") , emit: coverage + tuple val(meta), path("*.xl.CPMnorm.bedgraph") , emit: coverage_norm + path "versions.yml" ,emit: versions + + + script: + def prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}" + if (crosslink_position == "start"){ + """ + bedtools bamtobed -i $bam > dedup.bed + bedtools shift -m 1 -p -1 -i dedup.bed -g $fai > shifted.bed + bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed + bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed + cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed + cat ${prefix}.xl.bed | awk '{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph + TOTAL_VARIABLE=`cat ${prefix}.xl.bed | awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'` + cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + linux: NOVERSION + bedtools: `bedtools --version | head -n 1` + END_VERSIONS + """ + } else if (crosslink_position == "middle"){ + """ + bedtools bamtobed -i $bam > dedup.bed + awk '{OFS="\t"}{mid=int((\$2+\$3)/2); print \$1, mid, mid+1, \$4, \$5, \$6}' dedup.bed > shifted.bed + bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed + bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed + cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed + cat ${prefix}.xl.bed | awk '{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph + TOTAL_VARIABLE=`cat ${prefix}.xl.bed | awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'` + cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + linux: NOVERSION + bedtools: `bedtools --version | head -n 1` + END_VERSIONS + """ + } else if (crosslink_position == "end"){ + """ + bedtools bamtobed -i $bam > dedup.bed + awk -v OFS="\t" '\$6=="+" {print \$1,\$3,\$3+1,\$4,\$5,\$6} \$6=="-" {print \$1,\$2-1,\$2,\$4,\$5,\$6}' dedup.bed > shifted.bed + bedtools genomecov -dz -strand + -3 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed + bedtools genomecov -dz -strand - -3 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed + cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed + cat ${prefix}.xl.bed | awk '{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph + TOTAL_VARIABLE=`cat ${prefix}.xl.bed | awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'` + cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + linux: NOVERSION + bedtools: `bedtools --version | head -n 1` + END_VERSIONS + """ + } else { + error "Invalid crosslink_position: ${crosslink_position}" + } +} diff --git a/nextflow.config b/nextflow.config index 8c471a2..e857fac 100644 --- a/nextflow.config +++ b/nextflow.config @@ -69,6 +69,7 @@ params { save_unaligned_res = true // Pipeline params + crosslink_position = "start" encode_eclip = false move_umi_to_header = false umi_header_format = null diff --git a/schema/clipseq.json b/schema/clipseq.json index 0fe828e..4e0aff5 100644 --- a/schema/clipseq.json +++ b/schema/clipseq.json @@ -287,6 +287,12 @@ "description": "Additional pipeline configuration options.", "advanced": true, "params": { + "crosslink_position": { + "name": "Crosslink position", + "description": "The position of the crosslink in the read. Options are 'start', 'end' or 'middle'. Note that 'start' and 'end' correspond to start and end of read minus and plus 1 respectively.", + "type": "string", + "default": "start" + }, "move_umi_to_header": { "name": "Extract UMI to header", "description": "Runs UMI to header extraction based on the head format provided in UMI header format.", @@ -350,15 +356,15 @@ ], "outputs": [ { - "name": "Normalised genome crosslink bedgraph", - "description": "Genomic crosslinks normalised to total sample crosslinks in bedgraph format for genome browser viewing.", - "filetype": "bedgraph", - "process": "CROSSLINK_NORMCOVERAGE" + "name": "Raw genome crosslink bedgraph", + "description": "Genomic crosslinks in bedgraph format for genome browser viewing.", + "filetype": ".xl.bedgraph", + "process": "CALC_GENOME_CROSSLINKS" }, { "name": "Genomic peaks", "description": "Genomic peaks generated by Clippy.", - "filetype": "bed", + "filetype": "Peaks.bed", "process": "CLIPPY_GENOME" }, { diff --git a/subworkflows/goodwright/clip_calc_crosslinks/default.config b/subworkflows/goodwright/clip_calc_crosslinks/default.config deleted file mode 100644 index 4b0cc0a..0000000 --- a/subworkflows/goodwright/clip_calc_crosslinks/default.config +++ /dev/null @@ -1,49 +0,0 @@ -process { - - withName: BEDTOOLS_SHIFT { - ext.args = '-m 1 -p -1' - } - - withName: BEDTOOLS_GENOMECOV_POS { - ext.args = '-dz -strand + -5' - } - - withName: BEDTOOLS_GENOMECOV_NEG { - ext.args = '-dz -strand - -5' - } - - withName: SELECT_BED_POS { - ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}\'' - ext.suffix = '.pos' - ext.ext = 'bed' - } - - withName: SELECT_BED_NEG { - ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}\'' - ext.suffix = '.neg' - ext.ext = 'bed' - } - - withName: MERGE_AND_SORT { - ext.cmd1 = 'sort -k1,1 -k2,2n' - ext.suffix = '.sorted' - ext.ext = 'bed' - } - - withName: CROSSLINK_COVERAGE { - ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' - ext.suffix = '.sorted' - ext.ext = 'bedgraph' - } - - withName: CROSSLINK_NORMCOVERAGE { - ext.cmd1 = """ - awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | - awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | - sort -k1,1 -k2,2n - """ - ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'' - ext.suffix = '.norm.sorted' - ext.ext = 'bedgraph' - } -} diff --git a/subworkflows/goodwright/clip_calc_crosslinks/main.nf b/subworkflows/goodwright/clip_calc_crosslinks/main.nf deleted file mode 100644 index b53c949..0000000 --- a/subworkflows/goodwright/clip_calc_crosslinks/main.nf +++ /dev/null @@ -1,120 +0,0 @@ -// -// Calculate clip crosslinks using an input BAM file and genome index file. -// Crosslinks are outputed as a BED file and additional coverage and normalised coverage -// tracks are calculated in BEDGRAPH format -// - -/* -* MODULES -*/ -include { BEDTOOLS_BAMTOBED } from '../../../modules/nf-core/bedtools/bamtobed/main.nf' -include { BEDTOOLS_SHIFT } from '../../../modules/goodwright/bedtools/shift/main.nf' -include { BEDTOOLS_GENOMECOV as BEDTOOLS_GENOMECOV_POS } from '../../../modules/nf-core/bedtools/genomecov/main.nf' -include { BEDTOOLS_GENOMECOV as BEDTOOLS_GENOMECOV_NEG } from '../../../modules/nf-core/bedtools/genomecov/main.nf' -include { LINUX_COMMAND as SELECT_BED_POS } from '../../../modules/goodwright/linux/command/main.nf' -include { LINUX_COMMAND as SELECT_BED_NEG } from '../../../modules/goodwright/linux/command/main.nf' -include { LINUX_COMMAND as MERGE_AND_SORT } from '../../../modules/goodwright/linux/command/main.nf' -include { LINUX_COMMAND as CROSSLINK_COVERAGE } from '../../../modules/goodwright/linux/command/main.nf' -include { LINUX_COMMAND as CROSSLINK_NORMCOVERAGE } from '../../../modules/goodwright/linux/command/main.nf' - -workflow CLIP_CALC_CROSSLINKS { - take: - bam // channel: [ val(meta), [ bam ] ] - fai // channel: [ fai ] - - main: - ch_versions = Channel.empty() - - /* - * MODULE: Convert input BAM file to BED format - */ - BEDTOOLS_BAMTOBED ( - bam - ) - ch_versions = ch_versions.mix(BEDTOOLS_BAMTOBED.out.versions) - - /* - * MODULE: Shift BED file according to parameters suppied in config (default is -s 0) - */ - BEDTOOLS_SHIFT ( - BEDTOOLS_BAMTOBED.out.bed, - fai - ) - ch_versions = ch_versions.mix(BEDTOOLS_SHIFT.out.versions) - - /* - * MODULE: Report depth at each position on the pos strand - */ - BEDTOOLS_GENOMECOV_POS ( - BEDTOOLS_SHIFT.out.bed.map{ [ it[0], it[1], 1 ] }, - fai, - 'pos.bed' - ) - ch_versions = ch_versions.mix(BEDTOOLS_GENOMECOV_POS.out.versions) - - /* - * MODULE: Report depth at each position on the neg strand - */ - BEDTOOLS_GENOMECOV_NEG ( - BEDTOOLS_SHIFT.out.bed.map{ [ it[0], it[1], 1 ] }, - fai, - 'neg.bed' - ) - - /* - * MODULE: Select columns in BED file using AWK - */ - SELECT_BED_POS ( - BEDTOOLS_GENOMECOV_POS.out.genomecov, - [], - false - ) - SELECT_BED_NEG ( - BEDTOOLS_GENOMECOV_NEG.out.genomecov, - [], - false - ) - - /* - * CHANNEL: Join POS/NEG files into one channel so they can be merged in the next module - */ - ch_merge_and_sort_input = SELECT_BED_POS.out.file - .map{ [ it[0].id, it[0], it[1] ] } - .join( SELECT_BED_NEG.out.file.map{ [ it[0].id, it[0], it[1] ] } ) - .map { [ it[1], [ it[2], it[4] ] ] } - //EXAMPLE CHANNEL STRUCT: [ [id:test], [ BED(pos), BED(neg) ] ] - //ch_merge_and_sort_input | view - - /* - * MODULE: Select columns in BED file using AWK - */ - MERGE_AND_SORT ( - ch_merge_and_sort_input, - [], - false - ) - - /* - * MODULE: Create coverage track using AWK - */ - CROSSLINK_COVERAGE ( - MERGE_AND_SORT.out.file, - [], - false - ) - - /* - * MODULE: Create normalised coverage track using AWK - */ - CROSSLINK_NORMCOVERAGE ( - MERGE_AND_SORT.out.file, - [], - true - ) - - emit: - bed = MERGE_AND_SORT.out.file // channel: [ val(meta), [ bed ] ] - coverage = CROSSLINK_COVERAGE.out.file // channel: [ val(meta), [ bedgraph ] ] - coverage_norm = CROSSLINK_NORMCOVERAGE.out.file // channel: [ val(meta), [ bedgraph ] ] - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/subworkflows/goodwright/clip_calc_crosslinks/meta.yml b/subworkflows/goodwright/clip_calc_crosslinks/meta.yml deleted file mode 100644 index 0d6ac85..0000000 --- a/subworkflows/goodwright/clip_calc_crosslinks/meta.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: clip_calc_crosslinks -description: | - Calculate clip crosslinks using an input BAM file and genome index file. Crosslinks are outputed as a BED file and additional - coverage and normalised coverage tracks are calculated in BEDGRAPH format. -keywords: - - bedtools - - crosslinks - - coverage - - fai - - bam -modules: - - goodwright/linux/command - - goodwright/bedtools/shift - - nf-core/bedtools/bamtobed - - nf-core/bedtools/genomecov -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM file - pattern: "*.bam" - - fai: - type: file - description: FAI file - pattern: "*.fai" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bed: - type: file - description: Bedfile containing the crosslink depths at genomic positions - pattern: "*.bed" - - coverage: - type: file - description: The crosslinks bed file turned into a bedgraph coverage file. - pattern: "*.bedgraph" - - coverage_norm: - type: file - description: The crosslinks bed file turned into a bedgraph coverage file normalised against total count - pattern: "*.bedgraph" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@chris-cheshire" From 6cd0860f05553b673faf9ae6a004d81feaa73b7a Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 7 Jan 2025 21:37:25 +0000 Subject: [PATCH 06/10] fix edge case crosslinks going over edge of chr --- modules/local/get_crosslinks.nf | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/local/get_crosslinks.nf b/modules/local/get_crosslinks.nf index 02ce4ad..6119720 100644 --- a/modules/local/get_crosslinks.nf +++ b/modules/local/get_crosslinks.nf @@ -24,7 +24,8 @@ process GET_CROSSLINKS { if (crosslink_position == "start"){ """ bedtools bamtobed -i $bam > dedup.bed - bedtools shift -m 1 -p -1 -i dedup.bed -g $fai > shifted.bed + bedtools shift -m 1 -p -1 -i dedup.bed -g $fai > shiftedtemp.bed + awk -v OFS="\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed @@ -41,7 +42,8 @@ process GET_CROSSLINKS { } else if (crosslink_position == "middle"){ """ bedtools bamtobed -i $bam > dedup.bed - awk '{OFS="\t"}{mid=int((\$2+\$3)/2); print \$1, mid, mid+1, \$4, \$5, \$6}' dedup.bed > shifted.bed + awk '{OFS="\t"}{mid=int((\$2+\$3)/2); print \$1, mid, mid+1, \$4, \$5, \$6}' dedup.bed > shiftedtemp.bed + awk -v OFS="\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed @@ -58,7 +60,8 @@ process GET_CROSSLINKS { } else if (crosslink_position == "end"){ """ bedtools bamtobed -i $bam > dedup.bed - awk -v OFS="\t" '\$6=="+" {print \$1,\$3,\$3+1,\$4,\$5,\$6} \$6=="-" {print \$1,\$2-1,\$2,\$4,\$5,\$6}' dedup.bed > shifted.bed + awk -v OFS="\t" '\$6=="+" {print \$1,\$3,\$3+1,\$4,\$5,\$6} \$6=="-" {print \$1,\$2-1,\$2,\$4,\$5,\$6}' dedup.bed > shiftedtemp.bed + awk -v OFS="\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed bedtools genomecov -dz -strand + -3 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed bedtools genomecov -dz -strand - -3 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed From 722f20050746170744ae5481dbb23ae6fe6cdab5 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Wed, 8 Jan 2025 13:22:46 +0000 Subject: [PATCH 07/10] fix middle crosslink position for spliced reads --- modules/local/get_crosslinks.nf | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/modules/local/get_crosslinks.nf b/modules/local/get_crosslinks.nf index 6119720..48b9ed6 100644 --- a/modules/local/get_crosslinks.nf +++ b/modules/local/get_crosslinks.nf @@ -41,16 +41,16 @@ process GET_CROSSLINKS { """ } else if (crosslink_position == "middle"){ """ - bedtools bamtobed -i $bam > dedup.bed - awk '{OFS="\t"}{mid=int((\$2+\$3)/2); print \$1, mid, mid+1, \$4, \$5, \$6}' dedup.bed > shiftedtemp.bed - awk -v OFS="\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed - bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed - bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed + bedtools bamtobed -bed12 -i $bam > dedup.bed12 + awk -F'\\t' -v OFS='\\t' 'function abs(x){return x<0?-x:x}{split(\$11,sizes,",");split(\$12,starts,",");total_covered=0;for(i=1;i<=\$10;i++){total_covered+=sizes[i]};target_pos=int(total_covered/2);covered_so_far=0;for(i=1;i<=\$10;i++){if(covered_so_far+sizes[i]>=target_pos){offset=target_pos-covered_so_far;mid_pos=\$2+starts[i]+offset;print \$1,mid_pos,mid_pos+1,\$4,\$5,\$6;break};covered_so_far+=sizes[i]}}' dedup.bed12 > shiftedtemp.bed + awk -v OFS="\\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed + bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed + bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed - cat ${prefix}.xl.bed | awk '{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph + cat ${prefix}.xl.bed | awk '{OFS = "\\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph TOTAL_VARIABLE=`cat ${prefix}.xl.bed | awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'` - cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph - + cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph + cat <<-END_VERSIONS > versions.yml "${task.process}": linux: NOVERSION From 80b9775517d529f7e0a55d04201f85c8fa373201 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Fri, 10 Jan 2025 13:11:51 +0000 Subject: [PATCH 08/10] correct all samples not running when fai is provided --- conf/test.config | 2 +- main.nf | 6 +++--- modules/local/get_crosslinks.nf | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conf/test.config b/conf/test.config index 60fd638..4d3397c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -21,7 +21,7 @@ params { max_time = '6.h' // Input data - samplesheet = './tests/data/samplesheets/small-single-sample-se.csv' + samplesheet = './tests/data/samplesheets/small-dual-sample-se.csv' fasta = './tests/data/genome/yeast_MitoV.fa.gz' smrna_fasta = './tests/data/genome/homosapiens_smallRNA.fa.gz' gtf = './tests/data/genome/yeast_MitoV.gtf.gz' diff --git a/main.nf b/main.nf index bd0d336..7bb5e8e 100644 --- a/main.nf +++ b/main.nf @@ -458,7 +458,7 @@ workflow CLIPSEQ { */ CALC_SMRNA_K1_CROSSLINKS ( ch_smrna_k1_bam.join(ch_smrna_k1_bai), - ch_smrna_fasta_fai, + ch_smrna_fasta_fai.collect{ it[1] }, params.crosslink_position ) ch_versions = ch_versions.mix(CALC_SMRNA_K1_CROSSLINKS.out.versions) @@ -471,7 +471,7 @@ workflow CLIPSEQ { */ CALC_GENOME_CROSSLINKS ( ch_genome_bam.join(ch_genome_bai), - ch_fasta_fai, + ch_fasta_fai.collect{ it[1] }, params.crosslink_position ) ch_versions = ch_versions.mix(CALC_GENOME_CROSSLINKS.out.versions) @@ -484,7 +484,7 @@ workflow CLIPSEQ { */ CALC_TRANSCRIPT_CROSSLINKS ( ch_transcript_bam.join(ch_transcript_bai), - ch_longest_transcript_fai, + ch_longest_transcript_fai.collect{ it[1] }, params.crosslink_position ) ch_versions = ch_versions.mix(CALC_TRANSCRIPT_CROSSLINKS.out.versions) diff --git a/modules/local/get_crosslinks.nf b/modules/local/get_crosslinks.nf index 48b9ed6..74a05d1 100644 --- a/modules/local/get_crosslinks.nf +++ b/modules/local/get_crosslinks.nf @@ -9,7 +9,7 @@ process GET_CROSSLINKS { input: tuple val(meta), path(bam), path(bai) - tuple val(meta2), path(fai) + path fai val crosslink_position output: From 9d44d8789bbe90039605b5c439f88d1c93f86a68 Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Tue, 14 Jan 2025 14:06:40 +0000 Subject: [PATCH 09/10] Update clipseq.json --- schema/clipseq.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema/clipseq.json b/schema/clipseq.json index 4e0aff5..1d1917f 100644 --- a/schema/clipseq.json +++ b/schema/clipseq.json @@ -371,7 +371,7 @@ "name": "Crosslink summary", "description": "Crosslinks summarised by gene, type (eg. CDS, intron) and subtype (eg. lncRNA, mRNA).", "filetype": "tsv", - "process": "ICOUNT_SUMMARY" + "process": "MERGE_SUMMARY" }, { "name": "K-mer enrichment", From e60e1a4e44850c96f73f64d5c93fe2da805d1b6e Mon Sep 17 00:00:00 2001 From: Charlotte Capitanchik Date: Mon, 20 Jan 2025 18:10:38 +0000 Subject: [PATCH 10/10] expose trim galore params --- schema/clipseq.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/schema/clipseq.json b/schema/clipseq.json index 4e0aff5..086c439 100644 --- a/schema/clipseq.json +++ b/schema/clipseq.json @@ -350,6 +350,11 @@ "name": "PEKA parameters", "description": "Parameters for PEKA K-mer enrichment analysis", "type": "string" + }, + "trimgalore_params": { + "name": "Trim Galore! parameters", + "description": "Parameters for Trim Galore! We reccomend to include the default params along with any new ones you add: '--fastqc --length 10 -q 20'", + "type": "string" } } }