diff --git a/conf/modules.config b/conf/modules.config index 9301b85..62e5729 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -568,55 +568,8 @@ if(params.run_umi_dedup) { if(params.run_calc_crosslinks) { process { - withName: '.*CROSSLINKS:BEDTOOLS_BAMTOBED' { - publishDir = [ - enabled: false - ] - } - - withName: '.*CROSSLINKS:BEDTOOLS_SHIFT' { - ext.args = '-m 1 -p -1' - publishDir = [ - enabled: false - ] - } - - withName: '.*CROSSLINKS:BEDTOOLS_GENOMECOV_POS' { - ext.args = '-dz -strand + -5' - publishDir = [ - enabled: false - ] - } - - withName: '.*CROSSLINKS:BEDTOOLS_GENOMECOV_NEG' { - ext.args = '-dz -strand - -5' - publishDir = [ - enabled: false - ] - } - - withName: '.*CROSSLINKS:SELECT_BED_POS' { - ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}\'' - ext.suffix = '.pos' - ext.ext = 'bed' - publishDir = [ - enabled: false - ] - } - - withName: '.*CROSSLINKS:SELECT_BED_NEG' { - ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}\'' - ext.suffix = '.neg' - ext.ext = 'bed' - publishDir = [ - enabled: false - ] - } - - withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS:MERGE_AND_SORT' { - ext.cmd1 = 'sort -k1,1 -k2,2n' + withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS' { ext.suffix = '.genome' - ext.ext = 'bed' publishDir = [ path: { "${params.outdir}/04_crosslinks" }, mode: "${params.publish_dir_mode}", @@ -624,66 +577,8 @@ if(params.run_calc_crosslinks) { ] } - withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS:CROSSLINK_COVERAGE' { - ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' - ext.suffix = '.genome' - ext.ext = 'bedgraph' - publishDir = [ - path: { "${params.outdir}/04_crosslinks" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS:CROSSLINK_NORMCOVERAGE' { - ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' - ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'' - ext.suffix = '.norm.genome' - ext.ext = 'bedgraph' - publishDir = [ - path: { "${params.outdir}/04_crosslinks" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS:MERGE_AND_SORT' { - ext.cmd1 = 'sort -k1,1 -k2,2n' + withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS' { ext.suffix = '.transcript' - ext.ext = 'bed' - publishDir = [ - path: { "${params.outdir}/04_crosslinks" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS:CROSSLINK_COVERAGE' { - ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' - ext.suffix = '.transcript' - ext.ext = 'bedgraph' - publishDir = [ - path: { "${params.outdir}/04_crosslinks" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS:CROSSLINK_NORMCOVERAGE' { - ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' - ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'' - ext.suffix = '.norm.transcript' - ext.ext = 'bedgraph' - publishDir = [ - path: { "${params.outdir}/04_crosslinks" }, - mode: "${params.publish_dir_mode}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:MERGE_AND_SORT' { - ext.cmd1 = 'sort -k1,1 -k2,2n' - ext.suffix = '.smrna_withk1' - ext.ext = 'bed' publishDir = [ path: { "${params.outdir}/04_crosslinks" }, mode: "${params.publish_dir_mode}", @@ -691,26 +586,14 @@ if(params.run_calc_crosslinks) { ] } - withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_COVERAGE' { - ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' + withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS' { ext.suffix = '.smrna_withk1' - ext.ext = 'bedgraph' publishDir = [ path: { "${params.outdir}/04_crosslinks" }, mode: "${params.publish_dir_mode}", saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - - withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_NORMCOVERAGE' { - ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' - ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'' - ext.suffix = '.norm.smrna_withk1' - ext.ext = 'bedgraph' - publishDir = [ - enabled: false - ] - } } } diff --git a/conf/test.config b/conf/test.config index 60fd638..4d3397c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -21,7 +21,7 @@ params { max_time = '6.h' // Input data - samplesheet = './tests/data/samplesheets/small-single-sample-se.csv' + samplesheet = './tests/data/samplesheets/small-dual-sample-se.csv' fasta = './tests/data/genome/yeast_MitoV.fa.gz' smrna_fasta = './tests/data/genome/homosapiens_smallRNA.fa.gz' gtf = './tests/data/genome/yeast_MitoV.gtf.gz' diff --git a/main.nf b/main.nf index 7d32840..7bb5e8e 100644 --- a/main.nf +++ b/main.nf @@ -95,6 +95,10 @@ ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: // include { MULTIQC } from './modules/local/multiqc' +include { GET_CROSSLINKS as CALC_SMRNA_K1_CROSSLINKS } from './modules/local/get_crosslinks' +include { GET_CROSSLINKS as CALC_GENOME_CROSSLINKS } from './modules/local/get_crosslinks' +include { GET_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS } from './modules/local/get_crosslinks' + // // SUBWORKFLOWS @@ -116,6 +120,7 @@ include { CLIPPY as CLIPPY_TRANSCRIPT } from './modules/goodwrigh include { PEKA } from './modules/goodwright/peka/main' include { DUMP_SOFTWARE_VERSIONS } from './modules/goodwright/dump_software_versions/main' include { CLIPSEQ_CLIPQC } from './modules/goodwright/clipseq/clipqc/main' +include { ENCODE_MOVEUMI } from './modules/goodwright/clipseq/encode_moveumi/main' // // SUBWORKFLOWS @@ -130,9 +135,6 @@ include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_MULTI_DEDUP } from './subwor include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_K1_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' include { BAM_DEDUP_SAMTOOLS_UMITOOLS as TRANSCRIPT_DEDUP } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main' -include { CLIP_CALC_CROSSLINKS as CALC_SMRNA_K1_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' -include { CLIP_CALC_CROSSLINKS as CALC_GENOME_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' -include { CLIP_CALC_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS } from './subworkflows/goodwright/clip_calc_crosslinks/main' include { PARACLU_ANALYSE as PARACLU_ANALYSE_GENOME } from './subworkflows/goodwright/paraclu_analyse/main' include { PARACLU_ANALYSE as PARACLU_ANALYSE_TRANSCRIPT } from './subworkflows/goodwright/paraclu_analyse/main' include { ICOUNT_ANALYSE } from './subworkflows/goodwright/icount_analyse/main' @@ -277,7 +279,13 @@ workflow CLIPSEQ { } //EXAMPLE CHANNEL STRUCT: [[id:h3k27me3_R1, group:h3k27me3, replicate:1, single_end:false], [FASTQ]] //ch_fastq | view - + if(params.encode_eclip){ + ENCODE_MOVEUMI ( + ch_fastq + ) + ch_versions = ch_versions.mix(ENCODE_MOVEUMI.out.versions) + ch_fastq = ENCODE_MOVEUMI.out.reads + } if(params.run_move_umi_to_header){ UMITOOLS_EXTRACT ( ch_fastq @@ -433,6 +441,9 @@ workflow CLIPSEQ { ch_versions = ch_versions.mix(TRANSCRIPT_DEDUP.out.versions) ch_transcript_bam = TRANSCRIPT_DEDUP.out.bam ch_transcript_bai = TRANSCRIPT_DEDUP.out.bai + } else { + ch_genome_bam = ch_genome_unique_bam + ch_genome_bai = ch_genome_unique_bai } ch_genome_crosslink_bed = Channel.empty() @@ -446,8 +457,9 @@ workflow CLIPSEQ { * SUBWORKFLOW: Run crosslink calculation for smRNA with -k 1 */ CALC_SMRNA_K1_CROSSLINKS ( - ch_smrna_k1_bam, - ch_smrna_fasta_fai.collect{ it[1] } + ch_smrna_k1_bam.join(ch_smrna_k1_bai), + ch_smrna_fasta_fai.collect{ it[1] }, + params.crosslink_position ) ch_versions = ch_versions.mix(CALC_SMRNA_K1_CROSSLINKS.out.versions) ch_smrna_crosslink_bed = CALC_SMRNA_K1_CROSSLINKS.out.bed @@ -458,8 +470,9 @@ workflow CLIPSEQ { * SUBWORKFLOW: Run crosslink calculation for genome */ CALC_GENOME_CROSSLINKS ( - ch_genome_bam, - ch_fasta_fai.collect{ it[1] } + ch_genome_bam.join(ch_genome_bai), + ch_fasta_fai.collect{ it[1] }, + params.crosslink_position ) ch_versions = ch_versions.mix(CALC_GENOME_CROSSLINKS.out.versions) ch_genome_crosslink_bed = CALC_GENOME_CROSSLINKS.out.bed @@ -470,8 +483,9 @@ workflow CLIPSEQ { * SUBWORKFLOW: Run crosslink calculation for transcripts */ CALC_TRANSCRIPT_CROSSLINKS ( - ch_transcript_bam, - ch_longest_transcript_fai.collect{ it[1] } + ch_transcript_bam.join(ch_transcript_bai), + ch_longest_transcript_fai.collect{ it[1] }, + params.crosslink_position ) ch_versions = ch_versions.mix(CALC_TRANSCRIPT_CROSSLINKS.out.versions) ch_trans_crosslink_bed = CALC_TRANSCRIPT_CROSSLINKS.out.bed diff --git a/modules/goodwright/clipseq/encode_moveumi/main.nf b/modules/goodwright/clipseq/encode_moveumi/main.nf new file mode 100644 index 0000000..2e2069d --- /dev/null +++ b/modules/goodwright/clipseq/encode_moveumi/main.nf @@ -0,0 +1,22 @@ +process ENCODE_MOVEUMI { + label "process_single" + + conda "bioconda::biopython=1.78 pigz=2.6" + container "quay.io/biocontainers/mulled-v2-877c4e5a8fad685ea5bde487e04924ac447923b9:b7daa641364165419b9a87d9988bc803f913c5b6-0" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + process_name = task.process + template 'encode_moveumi.py' +} diff --git a/modules/goodwright/clipseq/encode_moveumi/meta.yml b/modules/goodwright/clipseq/encode_moveumi/meta.yml new file mode 100644 index 0000000..8429c6e --- /dev/null +++ b/modules/goodwright/clipseq/encode_moveumi/meta.yml @@ -0,0 +1,40 @@ +name: clipseq_clipqc +description: Runs python-based clip quality control and outputs to a set of tsv files +keywords: + - iCLIP + - eCLIP + - CLIP + - qc +input: + - premap: + type: file + description: Premap metrics files + - mapped: + type: file + description: Alignment metrics files + - collapse: + type: file + description: UMI collapse metrics files + - xlinks: + type: file + description: xlinks metric files + - icount: + type: file + description: iCount metrics files + - paraclu: + type: file + description: paraclu metrics files + - clippy: + type: file + description: Clippy metrics files +output: + - tsv: + type: file + description: All tsv file outputs + pattern: "*.tsv" + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" +authors: + - "@chris-cheshire" diff --git a/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py new file mode 100644 index 0000000..4b4ff31 --- /dev/null +++ b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +import os +import sys +import gzip +import platform +from Bio import SeqIO +import Bio + +input_fq = "!{reads}" +output_fq = "!{prefix}.umi.fastq" + +with gzip.open(input_fq, mode = 'rt') as f_in: + with open(output_fq, mode = 'w') as f_out: + for record in SeqIO.parse(f_in, 'fastq'): + header = record.id.split(":") + if '_' not in header[-1]: + rearranged = ":".join(header[1:]) + '_rbc:' + header[0] + record.id = rearranged + record.name = rearranged + record.description = rearranged + SeqIO.write(record, f_out, 'fastq') + +os.system('pigz ' + output_fq) + +with open("versions.yml", "w") as out_f: + out_f.write("!{process_name}" + ":\n") + out_f.write(" python: " + platform.python_version() + "\n") + out_f.write(" biopython: " + Bio.__version__ + "\n") \ No newline at end of file diff --git a/modules/local/get_crosslinks.nf b/modules/local/get_crosslinks.nf new file mode 100644 index 0000000..74a05d1 --- /dev/null +++ b/modules/local/get_crosslinks.nf @@ -0,0 +1,81 @@ +process GET_CROSSLINKS { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bedtools=2.30.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.30.0--hc088bd4_0' : + 'biocontainers/bedtools:2.30.0--hc088bd4_0' }" + + input: + tuple val(meta), path(bam), path(bai) + path fai + val crosslink_position + + output: + tuple val(meta), path("*.xl.bed") , emit: bed + tuple val(meta), path("*.xl.bedgraph") , emit: coverage + tuple val(meta), path("*.xl.CPMnorm.bedgraph") , emit: coverage_norm + path "versions.yml" ,emit: versions + + + script: + def prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}" + if (crosslink_position == "start"){ + """ + bedtools bamtobed -i $bam > dedup.bed + bedtools shift -m 1 -p -1 -i dedup.bed -g $fai > shiftedtemp.bed + awk -v OFS="\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed + bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed + bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed + cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed + cat ${prefix}.xl.bed | awk '{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph + TOTAL_VARIABLE=`cat ${prefix}.xl.bed | awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'` + cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + linux: NOVERSION + bedtools: `bedtools --version | head -n 1` + END_VERSIONS + """ + } else if (crosslink_position == "middle"){ + """ + bedtools bamtobed -bed12 -i $bam > dedup.bed12 + awk -F'\\t' -v OFS='\\t' 'function abs(x){return x<0?-x:x}{split(\$11,sizes,",");split(\$12,starts,",");total_covered=0;for(i=1;i<=\$10;i++){total_covered+=sizes[i]};target_pos=int(total_covered/2);covered_so_far=0;for(i=1;i<=\$10;i++){if(covered_so_far+sizes[i]>=target_pos){offset=target_pos-covered_so_far;mid_pos=\$2+starts[i]+offset;print \$1,mid_pos,mid_pos+1,\$4,\$5,\$6;break};covered_so_far+=sizes[i]}}' dedup.bed12 > shiftedtemp.bed + awk -v OFS="\\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed + bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed + bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed + cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed + cat ${prefix}.xl.bed | awk '{OFS = "\\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph + TOTAL_VARIABLE=`cat ${prefix}.xl.bed | awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'` + cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + linux: NOVERSION + bedtools: `bedtools --version | head -n 1` + END_VERSIONS + """ + } else if (crosslink_position == "end"){ + """ + bedtools bamtobed -i $bam > dedup.bed + awk -v OFS="\t" '\$6=="+" {print \$1,\$3,\$3+1,\$4,\$5,\$6} \$6=="-" {print \$1,\$2-1,\$2,\$4,\$5,\$6}' dedup.bed > shiftedtemp.bed + awk -v OFS="\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed + bedtools genomecov -dz -strand + -3 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed + bedtools genomecov -dz -strand - -3 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed + cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed + cat ${prefix}.xl.bed | awk '{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph + TOTAL_VARIABLE=`cat ${prefix}.xl.bed | awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'` + cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + linux: NOVERSION + bedtools: `bedtools --version | head -n 1` + END_VERSIONS + """ + } else { + error "Invalid crosslink_position: ${crosslink_position}" + } +} diff --git a/nextflow.config b/nextflow.config index 6bbe635..e857fac 100644 --- a/nextflow.config +++ b/nextflow.config @@ -69,6 +69,8 @@ params { save_unaligned_res = true // Pipeline params + crosslink_position = "start" + encode_eclip = false move_umi_to_header = false umi_header_format = null save_unaligned = true // DO NOT CHANGE diff --git a/schema/clipseq.json b/schema/clipseq.json index 0fe828e..535e8e2 100644 --- a/schema/clipseq.json +++ b/schema/clipseq.json @@ -287,6 +287,12 @@ "description": "Additional pipeline configuration options.", "advanced": true, "params": { + "crosslink_position": { + "name": "Crosslink position", + "description": "The position of the crosslink in the read. Options are 'start', 'end' or 'middle'. Note that 'start' and 'end' correspond to start and end of read minus and plus 1 respectively.", + "type": "string", + "default": "start" + }, "move_umi_to_header": { "name": "Extract UMI to header", "description": "Runs UMI to header extraction based on the head format provided in UMI header format.", @@ -344,28 +350,33 @@ "name": "PEKA parameters", "description": "Parameters for PEKA K-mer enrichment analysis", "type": "string" + }, + "trimgalore_params": { + "name": "Trim Galore! parameters", + "description": "Parameters for Trim Galore! We reccomend to include the default params along with any new ones you add: '--fastqc --length 10 -q 20'", + "type": "string" } } } ], "outputs": [ { - "name": "Normalised genome crosslink bedgraph", - "description": "Genomic crosslinks normalised to total sample crosslinks in bedgraph format for genome browser viewing.", - "filetype": "bedgraph", - "process": "CROSSLINK_NORMCOVERAGE" + "name": "Raw genome crosslink bedgraph", + "description": "Genomic crosslinks in bedgraph format for genome browser viewing.", + "filetype": ".xl.bedgraph", + "process": "CALC_GENOME_CROSSLINKS" }, { "name": "Genomic peaks", "description": "Genomic peaks generated by Clippy.", - "filetype": "bed", + "filetype": "Peaks.bed", "process": "CLIPPY_GENOME" }, { "name": "Crosslink summary", "description": "Crosslinks summarised by gene, type (eg. CDS, intron) and subtype (eg. lncRNA, mRNA).", "filetype": "tsv", - "process": "ICOUNT_SUMMARY" + "process": "MERGE_SUMMARY" }, { "name": "K-mer enrichment", diff --git a/subworkflows/goodwright/clip_calc_crosslinks/default.config b/subworkflows/goodwright/clip_calc_crosslinks/default.config deleted file mode 100644 index 4b0cc0a..0000000 --- a/subworkflows/goodwright/clip_calc_crosslinks/default.config +++ /dev/null @@ -1,49 +0,0 @@ -process { - - withName: BEDTOOLS_SHIFT { - ext.args = '-m 1 -p -1' - } - - withName: BEDTOOLS_GENOMECOV_POS { - ext.args = '-dz -strand + -5' - } - - withName: BEDTOOLS_GENOMECOV_NEG { - ext.args = '-dz -strand - -5' - } - - withName: SELECT_BED_POS { - ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}\'' - ext.suffix = '.pos' - ext.ext = 'bed' - } - - withName: SELECT_BED_NEG { - ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}\'' - ext.suffix = '.neg' - ext.ext = 'bed' - } - - withName: MERGE_AND_SORT { - ext.cmd1 = 'sort -k1,1 -k2,2n' - ext.suffix = '.sorted' - ext.ext = 'bed' - } - - withName: CROSSLINK_COVERAGE { - ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n' - ext.suffix = '.sorted' - ext.ext = 'bedgraph' - } - - withName: CROSSLINK_NORMCOVERAGE { - ext.cmd1 = """ - awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | - awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | - sort -k1,1 -k2,2n - """ - ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'' - ext.suffix = '.norm.sorted' - ext.ext = 'bedgraph' - } -} diff --git a/subworkflows/goodwright/clip_calc_crosslinks/main.nf b/subworkflows/goodwright/clip_calc_crosslinks/main.nf deleted file mode 100644 index b53c949..0000000 --- a/subworkflows/goodwright/clip_calc_crosslinks/main.nf +++ /dev/null @@ -1,120 +0,0 @@ -// -// Calculate clip crosslinks using an input BAM file and genome index file. -// Crosslinks are outputed as a BED file and additional coverage and normalised coverage -// tracks are calculated in BEDGRAPH format -// - -/* -* MODULES -*/ -include { BEDTOOLS_BAMTOBED } from '../../../modules/nf-core/bedtools/bamtobed/main.nf' -include { BEDTOOLS_SHIFT } from '../../../modules/goodwright/bedtools/shift/main.nf' -include { BEDTOOLS_GENOMECOV as BEDTOOLS_GENOMECOV_POS } from '../../../modules/nf-core/bedtools/genomecov/main.nf' -include { BEDTOOLS_GENOMECOV as BEDTOOLS_GENOMECOV_NEG } from '../../../modules/nf-core/bedtools/genomecov/main.nf' -include { LINUX_COMMAND as SELECT_BED_POS } from '../../../modules/goodwright/linux/command/main.nf' -include { LINUX_COMMAND as SELECT_BED_NEG } from '../../../modules/goodwright/linux/command/main.nf' -include { LINUX_COMMAND as MERGE_AND_SORT } from '../../../modules/goodwright/linux/command/main.nf' -include { LINUX_COMMAND as CROSSLINK_COVERAGE } from '../../../modules/goodwright/linux/command/main.nf' -include { LINUX_COMMAND as CROSSLINK_NORMCOVERAGE } from '../../../modules/goodwright/linux/command/main.nf' - -workflow CLIP_CALC_CROSSLINKS { - take: - bam // channel: [ val(meta), [ bam ] ] - fai // channel: [ fai ] - - main: - ch_versions = Channel.empty() - - /* - * MODULE: Convert input BAM file to BED format - */ - BEDTOOLS_BAMTOBED ( - bam - ) - ch_versions = ch_versions.mix(BEDTOOLS_BAMTOBED.out.versions) - - /* - * MODULE: Shift BED file according to parameters suppied in config (default is -s 0) - */ - BEDTOOLS_SHIFT ( - BEDTOOLS_BAMTOBED.out.bed, - fai - ) - ch_versions = ch_versions.mix(BEDTOOLS_SHIFT.out.versions) - - /* - * MODULE: Report depth at each position on the pos strand - */ - BEDTOOLS_GENOMECOV_POS ( - BEDTOOLS_SHIFT.out.bed.map{ [ it[0], it[1], 1 ] }, - fai, - 'pos.bed' - ) - ch_versions = ch_versions.mix(BEDTOOLS_GENOMECOV_POS.out.versions) - - /* - * MODULE: Report depth at each position on the neg strand - */ - BEDTOOLS_GENOMECOV_NEG ( - BEDTOOLS_SHIFT.out.bed.map{ [ it[0], it[1], 1 ] }, - fai, - 'neg.bed' - ) - - /* - * MODULE: Select columns in BED file using AWK - */ - SELECT_BED_POS ( - BEDTOOLS_GENOMECOV_POS.out.genomecov, - [], - false - ) - SELECT_BED_NEG ( - BEDTOOLS_GENOMECOV_NEG.out.genomecov, - [], - false - ) - - /* - * CHANNEL: Join POS/NEG files into one channel so they can be merged in the next module - */ - ch_merge_and_sort_input = SELECT_BED_POS.out.file - .map{ [ it[0].id, it[0], it[1] ] } - .join( SELECT_BED_NEG.out.file.map{ [ it[0].id, it[0], it[1] ] } ) - .map { [ it[1], [ it[2], it[4] ] ] } - //EXAMPLE CHANNEL STRUCT: [ [id:test], [ BED(pos), BED(neg) ] ] - //ch_merge_and_sort_input | view - - /* - * MODULE: Select columns in BED file using AWK - */ - MERGE_AND_SORT ( - ch_merge_and_sort_input, - [], - false - ) - - /* - * MODULE: Create coverage track using AWK - */ - CROSSLINK_COVERAGE ( - MERGE_AND_SORT.out.file, - [], - false - ) - - /* - * MODULE: Create normalised coverage track using AWK - */ - CROSSLINK_NORMCOVERAGE ( - MERGE_AND_SORT.out.file, - [], - true - ) - - emit: - bed = MERGE_AND_SORT.out.file // channel: [ val(meta), [ bed ] ] - coverage = CROSSLINK_COVERAGE.out.file // channel: [ val(meta), [ bedgraph ] ] - coverage_norm = CROSSLINK_NORMCOVERAGE.out.file // channel: [ val(meta), [ bedgraph ] ] - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/subworkflows/goodwright/clip_calc_crosslinks/meta.yml b/subworkflows/goodwright/clip_calc_crosslinks/meta.yml deleted file mode 100644 index 0d6ac85..0000000 --- a/subworkflows/goodwright/clip_calc_crosslinks/meta.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: clip_calc_crosslinks -description: | - Calculate clip crosslinks using an input BAM file and genome index file. Crosslinks are outputed as a BED file and additional - coverage and normalised coverage tracks are calculated in BEDGRAPH format. -keywords: - - bedtools - - crosslinks - - coverage - - fai - - bam -modules: - - goodwright/linux/command - - goodwright/bedtools/shift - - nf-core/bedtools/bamtobed - - nf-core/bedtools/genomecov -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM file - pattern: "*.bam" - - fai: - type: file - description: FAI file - pattern: "*.fai" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bed: - type: file - description: Bedfile containing the crosslink depths at genomic positions - pattern: "*.bed" - - coverage: - type: file - description: The crosslinks bed file turned into a bedgraph coverage file. - pattern: "*.bedgraph" - - coverage_norm: - type: file - description: The crosslinks bed file turned into a bedgraph coverage file normalised against total count - pattern: "*.bedgraph" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@chris-cheshire"