goodwright · CharlotteAnne · Feb 27, 2025 · May 14, 2024 · May 14, 2024 · May 14, 2024
diff --git a/conf/modules.config b/conf/modules.config
@@ -568,149 +568,32 @@ if(params.run_umi_dedup) {
 
 if(params.run_calc_crosslinks) {
     process {
-        withName: '.*CROSSLINKS:BEDTOOLS_BAMTOBED' {
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: '.*CROSSLINKS:BEDTOOLS_SHIFT' {
-            ext.args = '-m 1 -p -1'
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: '.*CROSSLINKS:BEDTOOLS_GENOMECOV_POS' {
-            ext.args = '-dz -strand + -5'
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: '.*CROSSLINKS:BEDTOOLS_GENOMECOV_NEG' {
-            ext.args = '-dz -strand - -5'
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: '.*CROSSLINKS:SELECT_BED_POS' {
-            ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}\''
-            ext.suffix = '.pos'
-            ext.ext = 'bed'
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: '.*CROSSLINKS:SELECT_BED_NEG' {
-            ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}\''
-            ext.suffix = '.neg'
-            ext.ext = 'bed'
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS:MERGE_AND_SORT' {
-            ext.cmd1 = 'sort -k1,1 -k2,2n'
+        withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS' {
             ext.suffix = '.genome'
-            ext.ext = 'bed'
             publishDir = [
                 path: { "${params.outdir}/04_crosslinks" },
                 mode: "${params.publish_dir_mode}",
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ]
         }
 
-        withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS:CROSSLINK_COVERAGE' {
-            ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
-            ext.suffix = '.genome'
-            ext.ext = 'bedgraph'
-            publishDir = [
-                path: { "${params.outdir}/04_crosslinks" },
-                mode: "${params.publish_dir_mode}",
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-            ]
-        }
-
-        withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS:CROSSLINK_NORMCOVERAGE' {
-            ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
-            ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\''
-            ext.suffix = '.norm.genome'
-            ext.ext = 'bedgraph'
-            publishDir = [
-                path: { "${params.outdir}/04_crosslinks" },
-                mode: "${params.publish_dir_mode}",
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-            ]
-        }
-
-        withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS:MERGE_AND_SORT' {
-            ext.cmd1 = 'sort -k1,1 -k2,2n'
+        withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS' {
             ext.suffix = '.transcript'
-            ext.ext = 'bed'
-            publishDir = [
-                path: { "${params.outdir}/04_crosslinks" },
-                mode: "${params.publish_dir_mode}",
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-            ]
-        }
-
-        withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS:CROSSLINK_COVERAGE' {
-            ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
-            ext.suffix = '.transcript'
-            ext.ext = 'bedgraph'
-            publishDir = [
-                path: { "${params.outdir}/04_crosslinks" },
-                mode: "${params.publish_dir_mode}",
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-            ]
-        }
-
-        withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS:CROSSLINK_NORMCOVERAGE' {
-            ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
-            ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\''
-            ext.suffix = '.norm.transcript'
-            ext.ext = 'bedgraph'
-            publishDir = [
-                path: { "${params.outdir}/04_crosslinks" },
-                mode: "${params.publish_dir_mode}",
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-            ]
-        }
-        withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:MERGE_AND_SORT' {
-            ext.cmd1 = 'sort -k1,1 -k2,2n'
-            ext.suffix = '.smrna_withk1'
-            ext.ext = 'bed'
             publishDir = [
                 path: { "${params.outdir}/04_crosslinks" },
                 mode: "${params.publish_dir_mode}",
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ]
         }
 
-        withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_COVERAGE' {
-            ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
+        withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS' {
             ext.suffix = '.smrna_withk1'
-            ext.ext = 'bedgraph'
             publishDir = [
                 path: { "${params.outdir}/04_crosslinks" },
                 mode: "${params.publish_dir_mode}",
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ]
         }
-
-        withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_NORMCOVERAGE' {
-            ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
-            ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\''
-            ext.suffix = '.norm.smrna_withk1'
-            ext.ext = 'bedgraph'
-            publishDir = [
-                enabled: false
-            ]
-        }
     }
 }
 

diff --git a/conf/test.config b/conf/test.config
@@ -21,7 +21,7 @@ params {
     max_time   = '6.h'
 
     // Input data
-    samplesheet = './tests/data/samplesheets/small-single-sample-se.csv'
+    samplesheet = './tests/data/samplesheets/small-dual-sample-se.csv'
     fasta       = './tests/data/genome/yeast_MitoV.fa.gz'
     smrna_fasta = './tests/data/genome/homosapiens_smallRNA.fa.gz'
     gtf         = './tests/data/genome/yeast_MitoV.gtf.gz'

diff --git a/main.nf b/main.nf
@@ -95,6 +95,10 @@ ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists:
 //
 
 include { MULTIQC } from './modules/local/multiqc'
+include { GET_CROSSLINKS as CALC_SMRNA_K1_CROSSLINKS      } from './modules/local/get_crosslinks'
+include { GET_CROSSLINKS as CALC_GENOME_CROSSLINKS        } from './modules/local/get_crosslinks'
+include { GET_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS    } from './modules/local/get_crosslinks'
+
 
 //
 // SUBWORKFLOWS
@@ -116,6 +120,7 @@ include { CLIPPY as CLIPPY_TRANSCRIPT                } from './modules/goodwrigh
 include { PEKA                                       } from './modules/goodwright/peka/main'
 include { DUMP_SOFTWARE_VERSIONS                     } from './modules/goodwright/dump_software_versions/main'
 include { CLIPSEQ_CLIPQC                             } from './modules/goodwright/clipseq/clipqc/main'
+include { ENCODE_MOVEUMI                             } from './modules/goodwright/clipseq/encode_moveumi/main'
 
 //
 // SUBWORKFLOWS
@@ -130,9 +135,6 @@ include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_MULTI_DEDUP     } from './subwor
 include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_DEDUP            } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
 include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_K1_DEDUP         } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
 include { BAM_DEDUP_SAMTOOLS_UMITOOLS as TRANSCRIPT_DEDUP       } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
-include { CLIP_CALC_CROSSLINKS as CALC_SMRNA_K1_CROSSLINKS      } from './subworkflows/goodwright/clip_calc_crosslinks/main'
-include { CLIP_CALC_CROSSLINKS as CALC_GENOME_CROSSLINKS        } from './subworkflows/goodwright/clip_calc_crosslinks/main'
-include { CLIP_CALC_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS    } from './subworkflows/goodwright/clip_calc_crosslinks/main'
 include { PARACLU_ANALYSE as PARACLU_ANALYSE_GENOME             } from './subworkflows/goodwright/paraclu_analyse/main'
 include { PARACLU_ANALYSE as PARACLU_ANALYSE_TRANSCRIPT         } from './subworkflows/goodwright/paraclu_analyse/main'
 include { ICOUNT_ANALYSE                                        } from './subworkflows/goodwright/icount_analyse/main'
@@ -277,7 +279,13 @@ workflow CLIPSEQ {
     }
     //EXAMPLE CHANNEL STRUCT: [[id:h3k27me3_R1, group:h3k27me3, replicate:1, single_end:false], [FASTQ]]
     //ch_fastq | view
-
+    if(params.encode_eclip){
+        ENCODE_MOVEUMI (
+            ch_fastq
+        )
+        ch_versions = ch_versions.mix(ENCODE_MOVEUMI.out.versions)
+        ch_fastq    = ENCODE_MOVEUMI.out.reads
+    }
     if(params.run_move_umi_to_header){
         UMITOOLS_EXTRACT (
             ch_fastq
@@ -433,6 +441,9 @@ workflow CLIPSEQ {
         ch_versions       = ch_versions.mix(TRANSCRIPT_DEDUP.out.versions)
         ch_transcript_bam = TRANSCRIPT_DEDUP.out.bam
         ch_transcript_bai = TRANSCRIPT_DEDUP.out.bai
+    } else {
+        ch_genome_bam = ch_genome_unique_bam
+        ch_genome_bai = ch_genome_unique_bai
     }
 
     ch_genome_crosslink_bed           = Channel.empty()
@@ -446,8 +457,9 @@ workflow CLIPSEQ {
         * SUBWORKFLOW: Run crosslink calculation for smRNA with -k 1
         */
         CALC_SMRNA_K1_CROSSLINKS (
-            ch_smrna_k1_bam,
-            ch_smrna_fasta_fai.collect{ it[1] }
+            ch_smrna_k1_bam.join(ch_smrna_k1_bai),
+            ch_smrna_fasta_fai.collect{ it[1] },
+            params.crosslink_position
         )
         ch_versions                      = ch_versions.mix(CALC_SMRNA_K1_CROSSLINKS.out.versions)
         ch_smrna_crosslink_bed           = CALC_SMRNA_K1_CROSSLINKS.out.bed
@@ -458,8 +470,9 @@ workflow CLIPSEQ {
         * SUBWORKFLOW: Run crosslink calculation for genome
         */
         CALC_GENOME_CROSSLINKS (
-            ch_genome_bam,
-            ch_fasta_fai.collect{ it[1] }
+            ch_genome_bam.join(ch_genome_bai),
+            ch_fasta_fai.collect{ it[1] },
+            params.crosslink_position
         )
         ch_versions                       = ch_versions.mix(CALC_GENOME_CROSSLINKS.out.versions)
         ch_genome_crosslink_bed           = CALC_GENOME_CROSSLINKS.out.bed
@@ -470,8 +483,9 @@ workflow CLIPSEQ {
         * SUBWORKFLOW: Run crosslink calculation for transcripts
         */
         CALC_TRANSCRIPT_CROSSLINKS (
-            ch_transcript_bam,
-            ch_longest_transcript_fai.collect{ it[1] }
+            ch_transcript_bam.join(ch_transcript_bai),
+            ch_longest_transcript_fai.collect{ it[1] },
+            params.crosslink_position
         )
         ch_versions                      = ch_versions.mix(CALC_TRANSCRIPT_CROSSLINKS.out.versions)
         ch_trans_crosslink_bed           = CALC_TRANSCRIPT_CROSSLINKS.out.bed

diff --git a/modules/goodwright/clipseq/encode_moveumi/main.nf b/modules/goodwright/clipseq/encode_moveumi/main.nf
@@ -0,0 +1,26 @@
+// Runs the encode_moveumi.py template on one sample's reads and emits the
+// resulting gzipped FASTQ together with a versions.yml file.
+process ENCODE_MOVEUMI {
+    label "process_single"
+
+    conda "bioconda::biopython=1.78 pigz=2.6"
+    container "quay.io/biocontainers/mulled-v2-877c4e5a8fad685ea5bde487e04924ac447923b9:b7daa641364165419b9a87d9988bc803f913c5b6-0"
+
+    input:
+    tuple val(meta), path(reads)
+
+    output:
+    tuple val(meta), path("*.fastq.gz"), emit: reads
+    path  "versions.yml"               , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    shell:
+    // The template reads `reads`, `prefix` and `process_name`; the latter two
+    // are assigned without `def`, presumably so the template can see them
+    // (NOTE(review): confirm against the Nextflow template binding rules).
+    prefix       = task.ext.prefix ?: "${meta.id}"
+    process_name = task.process
+    template 'encode_moveumi.py'
+}
diff --git a/modules/goodwright/clipseq/encode_moveumi/meta.yml b/modules/goodwright/clipseq/encode_moveumi/meta.yml
@@ -0,0 +1,28 @@
+name: clipseq_encode_moveumi
+description: 'Moves the first colon-delimited field of each FASTQ read ID to the end of the ID as an "_rbc:<field>" suffix and writes the reads to a gzipped FASTQ file'
+keywords:
+  - iCLIP
+  - eCLIP
+  - CLIP
+  - umi
+input:
+  - meta:
+      type: map
+      description: Groovy Map containing sample information (id is used as the default output prefix)
+  - reads:
+      type: file
+      description: Single gzipped FASTQ file whose read IDs are to be rearranged
+output:
+  - meta:
+      type: map
+      description: Groovy Map containing sample information, passed through unchanged
+  - reads:
+      type: file
+      description: Gzipped FASTQ file with rearranged read IDs
+      pattern: "*.fastq.gz"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@chris-cheshire"
diff --git a/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+"""Move the first colon-delimited field of every FASTQ read ID to the end of the ID.
+
+Nextflow shell template: the input path, output prefix and process name are
+substituted into the string literals below before the script is executed.
+
+A read ID "F0:F1:...:Fn" is rewritten as "F1:...:Fn_rbc:F0" (F0 is presumably the
+eCLIP barcode/UMI, going by the module name). IDs whose last field already
+contains "_" are written through unchanged. The result is compressed with pigz
+and a versions.yml file is written next to it.
+"""
+
+import os
+import sys
+import gzip
+import platform
+import subprocess
+from Bio import SeqIO
+import Bio
+
+input_fq = "!{reads}"
+output_fq = "!{prefix}.umi.fastq"
+
+with gzip.open(input_fq, mode="rt") as f_in, open(output_fq, mode="w") as f_out:
+    for record in SeqIO.parse(f_in, "fastq"):
+        header = record.id.split(":")
+        # An underscore in the last field is treated as "already rearranged".
+        if "_" not in header[-1]:
+            rearranged = ":".join(header[1:]) + "_rbc:" + header[0]
+            # Keep id, name and description in sync with the rearranged ID.
+            record.id = rearranged
+            record.name = rearranged
+            record.description = rearranged
+        SeqIO.write(record, f_out, "fastq")
+
+# Call pigz with an argument list (no shell) so the file name is passed
+# verbatim, and let a non-zero exit status raise instead of leaving the task
+# to fail later on a missing output file.
+subprocess.run(["pigz", output_fq], check=True)
+
+with open("versions.yml", "w") as out_f:
+    out_f.write("!{process_name}" + ":\n")
+    out_f.write("    python: " + platform.python_version() + "\n")
+    out_f.write("    biopython: " + Bio.__version__ + "\n")