From d7f401af5fdfbe08bc029e18d7724b45e53158a0 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 14 May 2024 14:27:11 +0100
Subject: [PATCH 01/10] add encode eclip mode

---
 main.nf                                       |  9 ++++-
 .../goodwright/clipseq/encode_moveumi/main.nf | 24 +++++++++++
 .../clipseq/encode_moveumi/meta.yml           | 40 +++++++++++++++++++
 .../templates/encode_moveumi.py               | 29 ++++++++++++++
 nextflow.config                               |  1 +
 schema/clipseq.json                           |  5 +++
 6 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 modules/goodwright/clipseq/encode_moveumi/main.nf
 create mode 100644 modules/goodwright/clipseq/encode_moveumi/meta.yml
 create mode 100644 modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py

diff --git a/main.nf b/main.nf
index 7d32840..b457fa4 100644
--- a/main.nf
+++ b/main.nf
@@ -116,6 +116,7 @@ include { CLIPPY as CLIPPY_TRANSCRIPT                } from './modules/goodwrigh
 include { PEKA                                       } from './modules/goodwright/peka/main'
 include { DUMP_SOFTWARE_VERSIONS                     } from './modules/goodwright/dump_software_versions/main'
 include { CLIPSEQ_CLIPQC                             } from './modules/goodwright/clipseq/clipqc/main'
+include { ENCODE_MOVEUMI                             } from './modules/goodwright/clipseq/encode_moveumi/main'
 
 //
 // SUBWORKFLOWS
@@ -277,7 +278,13 @@ workflow CLIPSEQ {
     }
     //EXAMPLE CHANNEL STRUCT: [[id:h3k27me3_R1, group:h3k27me3, replicate:1, single_end:false], [FASTQ]]
     //ch_fastq | view
-
+    if(params.encode_eclip){
+        ENCODE_MOVEUMI (
+            ch_fastq
+        )
+        ch_versions = ch_versions.mix(ENCODE_MOVEUMI.out.versions)
+        ch_fastq    = ENCODE_MOVEUMI.out.reads
+    }
     if(params.run_move_umi_to_header){
         UMITOOLS_EXTRACT (
             ch_fastq
diff --git a/modules/goodwright/clipseq/encode_moveumi/main.nf b/modules/goodwright/clipseq/encode_moveumi/main.nf
new file mode 100644
index 0000000..81154a9
--- /dev/null
+++ b/modules/goodwright/clipseq/encode_moveumi/main.nf
@@ -0,0 +1,24 @@
+process ENCODE_MOVEUMI {
+    label "process_single"
+
+    conda "bioconda::biopython=1.70"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+    'https://depot.galaxyproject.org/singularity/biopython:1.81 ' :
+    'biocontainers/biopython:1.81' }"
+
+    input:
+    tuple val(meta), path(reads)
+
+    output:
+    tuple val(meta), path("*.fastq.gz"), emit: reads
+    path  "versions.yml"               , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    shell:
+    def args     = task.ext.args ?: ''
+    prefix       = task.ext.prefix ?: "${meta.id}"
+    process_name = task.process
+    template 'encode_moveumi.py'
+}
diff --git a/modules/goodwright/clipseq/encode_moveumi/meta.yml b/modules/goodwright/clipseq/encode_moveumi/meta.yml
new file mode 100644
index 0000000..8429c6e
--- /dev/null
+++ b/modules/goodwright/clipseq/encode_moveumi/meta.yml
@@ -0,0 +1,40 @@
+name: clipseq_clipqc
+description: Runs python-based clip quality control and outputs to a set of tsv files
+keywords:
+  - iCLIP
+  - eCLIP
+  - CLIP
+  - qc
+input:
+  - premap:
+      type: file
+      description: Premap metrics files
+  - mapped:
+      type: file
+      description: Alignment metrics files
+  - collapse:
+      type: file
+      description: UMI collapse metrics files
+  - xlinks:
+      type: file
+      description: xlinks metric files
+  - icount:
+      type: file
+      description: iCount metrics files
+  - paraclu:
+      type: file
+      description: paraclu metrics files
+  - clippy:
+      type: file
+      description: Clippy metrics files
+output:
+  - tsv:
+      type: file
+      description: All tsv file outputs
+      pattern: "*.tsv"
+  - version:
+      type: file
+      description: File containing software version
+      pattern: "*.{version.txt}"
+authors:
+  - "@chris-cheshire"
diff --git a/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py
new file mode 100644
index 0000000..7794903
--- /dev/null
+++ b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import gzip
+import platform
+from Bio import SeqIO
+import Bio
+
+input_fq = "!{reads}"
+output_fq = "!{prefix}.umi.fastq"
+
+with gzip.open(input_fq, mode = 'rt') as f_in:
+    with open(output_fq, mode = 'w') as f_out:
+        for record in SeqIO.parse(f_in, 'fastq'):
+            header = record.id.split(":")
+            if '_' not in header[-1]:
+                rearranged = ":".join(header[1:]) + '_' + header[0]
+                record.id = rearranged
+                record.name = rearranged
+                record.description = rearranged
+            SeqIO.write(record, f_out, 'fastq')
+
+os.system('gzip ' + output_fq)
+
+with open("versions.yml", "w") as out_f:
+    out_f.write("!{process_name}" + ":\n")
+    out_f.write("    python: " + platform.python_version() + "\n")
+    out_f.write("    biopython: " + Bio.__version__ + "\n")
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index 6bbe635..8c471a2 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -69,6 +69,7 @@ params {
     save_unaligned_res  = true
 
     // Pipeline params
+    encode_eclip        = false
     move_umi_to_header  = false
     umi_header_format   = null
     save_unaligned      = true // DO NOT CHANGE
diff --git a/schema/clipseq.json b/schema/clipseq.json
index 2ca3536..2ca109d 100644
--- a/schema/clipseq.json
+++ b/schema/clipseq.json
@@ -270,6 +270,11 @@
             "description": "Additional pipeline configuration options.",
             "advanced": true,
             "properties": {
+                "encode_eclip": {
+                    "name": "ENCODE eCLIP mode",
+                    "description": "Use just read 2 and enable this option which moves the header UMI to correct location for downstream processing.",
+                    "type": "boolean"
+                },
                 "move_umi_to_header": {
                     "name": "Extract UMI to header",
                     "description": "Runs UMI to header extraction based on the head format provided in UMI header format.",

From 986b723a186a013032032d1f469c49afc3bcde9a Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 14 May 2024 15:44:58 +0100
Subject: [PATCH 02/10] update encode_moveumi

---
 .../clipseq/encode_moveumi/templates/encode_moveumi.py          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py
index 7794903..ef58777 100644
--- a/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py
+++ b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py
@@ -15,7 +15,7 @@
         for record in SeqIO.parse(f_in, 'fastq'):
             header = record.id.split(":")
             if '_' not in header[-1]:
-                rearranged = ":".join(header[1:]) + '_' + header[0]
+                rearranged = ":".join(header[1:]) + '_rbc:' + header[0]
                 record.id = rearranged
                 record.name = rearranged
                 record.description = rearranged

From 47a9c391d347106f1687872d9e53f99b610ad334 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 14 May 2024 15:53:54 +0100
Subject: [PATCH 03/10] update zipping to pigz for speed

---
 modules/goodwright/clipseq/encode_moveumi/main.nf           | 6 ++----
 .../clipseq/encode_moveumi/templates/encode_moveumi.py      | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/modules/goodwright/clipseq/encode_moveumi/main.nf b/modules/goodwright/clipseq/encode_moveumi/main.nf
index 81154a9..2e2069d 100644
--- a/modules/goodwright/clipseq/encode_moveumi/main.nf
+++ b/modules/goodwright/clipseq/encode_moveumi/main.nf
@@ -1,10 +1,8 @@
 process ENCODE_MOVEUMI {
     label "process_single"
 
-    conda "bioconda::biopython=1.70"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-    'https://depot.galaxyproject.org/singularity/biopython:1.81 ' :
-    'biocontainers/biopython:1.81' }"
+    conda "bioconda::biopython=1.78 pigz=2.6"
+    container "quay.io/biocontainers/mulled-v2-877c4e5a8fad685ea5bde487e04924ac447923b9:b7daa641364165419b9a87d9988bc803f913c5b6-0"
 
     input:
     tuple val(meta), path(reads)
diff --git a/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py
index ef58777..4b4ff31 100644
--- a/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py
+++ b/modules/goodwright/clipseq/encode_moveumi/templates/encode_moveumi.py
@@ -21,7 +21,7 @@
                 record.description = rearranged
             SeqIO.write(record, f_out, 'fastq')
 
-os.system('gzip ' + output_fq)
+os.system('pigz ' + output_fq)
 
 with open("versions.yml", "w") as out_f:
     out_f.write("!{process_name}" + ":\n")

From 9efb325ba33bdd96a3f1a7bbdd1902bc268c6892 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 7 Jan 2025 17:21:50 +0000
Subject: [PATCH 04/10] correct "skip umi dedupe" path

Previously, many processes were skipped when the skip_umi_dedupe param was set to true, because ch_genome_bam and ch_genome_bai were never set on that path. This is now fixed by adding an else clause to the logic in main.nf that falls back to ch_genome_unique_bam and ch_genome_unique_bai.
---
 main.nf | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/main.nf b/main.nf
index b457fa4..687b2c2 100644
--- a/main.nf
+++ b/main.nf
@@ -440,6 +440,9 @@ workflow CLIPSEQ {
         ch_versions       = ch_versions.mix(TRANSCRIPT_DEDUP.out.versions)
         ch_transcript_bam = TRANSCRIPT_DEDUP.out.bam
         ch_transcript_bai = TRANSCRIPT_DEDUP.out.bai
+    } else {
+        ch_genome_bam = ch_genome_unique_bam
+        ch_genome_bai = ch_genome_unique_bai
     }
 
     ch_genome_crosslink_bed           = Channel.empty()

From 35f55e873a62cf66773240319edc1211cc8f01db Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 7 Jan 2025 19:13:28 +0000
Subject: [PATCH 05/10] update get crosslinks

Convert the crosslink calculation from a subworkflow into a single, more manageable module.
Add the ability to choose the crosslink position as the start, middle or end of the read.
---
 conf/modules.config                           | 123 +-----------------
 main.nf                                       |  22 ++--
 modules/local/get_crosslinks.nf               |  78 +++++++++++
 nextflow.config                               |   1 +
 schema/clipseq.json                           |  16 ++-
 .../clip_calc_crosslinks/default.config       |  49 -------
 .../goodwright/clip_calc_crosslinks/main.nf   | 120 -----------------
 .../goodwright/clip_calc_crosslinks/meta.yml  |  53 --------
 8 files changed, 106 insertions(+), 356 deletions(-)
 create mode 100644 modules/local/get_crosslinks.nf
 delete mode 100644 subworkflows/goodwright/clip_calc_crosslinks/default.config
 delete mode 100644 subworkflows/goodwright/clip_calc_crosslinks/main.nf
 delete mode 100644 subworkflows/goodwright/clip_calc_crosslinks/meta.yml

diff --git a/conf/modules.config b/conf/modules.config
index 9301b85..62e5729 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -568,55 +568,8 @@ if(params.run_umi_dedup) {
 
 if(params.run_calc_crosslinks) {
     process {
-        withName: '.*CROSSLINKS:BEDTOOLS_BAMTOBED' {
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: '.*CROSSLINKS:BEDTOOLS_SHIFT' {
-            ext.args = '-m 1 -p -1'
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: '.*CROSSLINKS:BEDTOOLS_GENOMECOV_POS' {
-            ext.args = '-dz -strand + -5'
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: '.*CROSSLINKS:BEDTOOLS_GENOMECOV_NEG' {
-            ext.args = '-dz -strand - -5'
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: '.*CROSSLINKS:SELECT_BED_POS' {
-            ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}\''
-            ext.suffix = '.pos'
-            ext.ext = 'bed'
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: '.*CROSSLINKS:SELECT_BED_NEG' {
-            ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}\''
-            ext.suffix = '.neg'
-            ext.ext = 'bed'
-            publishDir = [
-                enabled: false
-            ]
-        }
-
-        withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS:MERGE_AND_SORT' {
-            ext.cmd1 = 'sort -k1,1 -k2,2n'
+        withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS' {
             ext.suffix = '.genome'
-            ext.ext = 'bed'
             publishDir = [
                 path: { "${params.outdir}/04_crosslinks" },
                 mode: "${params.publish_dir_mode}",
@@ -624,66 +577,8 @@ if(params.run_calc_crosslinks) {
             ]
         }
 
-        withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS:CROSSLINK_COVERAGE' {
-            ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
-            ext.suffix = '.genome'
-            ext.ext = 'bedgraph'
-            publishDir = [
-                path: { "${params.outdir}/04_crosslinks" },
-                mode: "${params.publish_dir_mode}",
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-            ]
-        }
-
-        withName: 'CLIPSEQ:CALC_GENOME_CROSSLINKS:CROSSLINK_NORMCOVERAGE' {
-            ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
-            ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\''
-            ext.suffix = '.norm.genome'
-            ext.ext = 'bedgraph'
-            publishDir = [
-                path: { "${params.outdir}/04_crosslinks" },
-                mode: "${params.publish_dir_mode}",
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-            ]
-        }
-
-        withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS:MERGE_AND_SORT' {
-            ext.cmd1 = 'sort -k1,1 -k2,2n'
+        withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS' {
             ext.suffix = '.transcript'
-            ext.ext = 'bed'
-            publishDir = [
-                path: { "${params.outdir}/04_crosslinks" },
-                mode: "${params.publish_dir_mode}",
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-            ]
-        }
-
-        withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS:CROSSLINK_COVERAGE' {
-            ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
-            ext.suffix = '.transcript'
-            ext.ext = 'bedgraph'
-            publishDir = [
-                path: { "${params.outdir}/04_crosslinks" },
-                mode: "${params.publish_dir_mode}",
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-            ]
-        }
-
-        withName: 'CLIPSEQ:CALC_TRANSCRIPT_CROSSLINKS:CROSSLINK_NORMCOVERAGE' {
-            ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
-            ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\''
-            ext.suffix = '.norm.transcript'
-            ext.ext = 'bedgraph'
-            publishDir = [
-                path: { "${params.outdir}/04_crosslinks" },
-                mode: "${params.publish_dir_mode}",
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-            ]
-        }
-        withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:MERGE_AND_SORT' {
-            ext.cmd1 = 'sort -k1,1 -k2,2n'
-            ext.suffix = '.smrna_withk1'
-            ext.ext = 'bed'
             publishDir = [
                 path: { "${params.outdir}/04_crosslinks" },
                 mode: "${params.publish_dir_mode}",
@@ -691,26 +586,14 @@ if(params.run_calc_crosslinks) {
             ]
         }
 
-        withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_COVERAGE' {
-            ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
+        withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS' {
             ext.suffix = '.smrna_withk1'
-            ext.ext = 'bedgraph'
             publishDir = [
                 path: { "${params.outdir}/04_crosslinks" },
                 mode: "${params.publish_dir_mode}",
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ]
         }
-
-        withName: 'CLIPSEQ:CALC_SMRNA_K1_CROSSLINKS:CROSSLINK_NORMCOVERAGE' {
-            ext.cmd1 = 'awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
-            ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\''
-            ext.suffix = '.norm.smrna_withk1'
-            ext.ext = 'bedgraph'
-            publishDir = [
-                enabled: false
-            ]
-        }
     }
 }
 
diff --git a/main.nf b/main.nf
index 687b2c2..bd0d336 100644
--- a/main.nf
+++ b/main.nf
@@ -95,6 +95,10 @@ ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists:
 //
 
 include { MULTIQC } from './modules/local/multiqc'
+include { GET_CROSSLINKS as CALC_SMRNA_K1_CROSSLINKS      } from './modules/local/get_crosslinks'
+include { GET_CROSSLINKS as CALC_GENOME_CROSSLINKS        } from './modules/local/get_crosslinks'
+include { GET_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS    } from './modules/local/get_crosslinks'
+
 
 //
 // SUBWORKFLOWS
@@ -131,9 +135,6 @@ include { BAM_DEDUP_SAMTOOLS_UMITOOLS as GENOME_MULTI_DEDUP     } from './subwor
 include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_DEDUP            } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
 include { BAM_DEDUP_SAMTOOLS_UMITOOLS as SMRNA_K1_DEDUP         } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
 include { BAM_DEDUP_SAMTOOLS_UMITOOLS as TRANSCRIPT_DEDUP       } from './subworkflows/goodwright/bam_dedup_samtools_umitools/main'
-include { CLIP_CALC_CROSSLINKS as CALC_SMRNA_K1_CROSSLINKS      } from './subworkflows/goodwright/clip_calc_crosslinks/main'
-include { CLIP_CALC_CROSSLINKS as CALC_GENOME_CROSSLINKS        } from './subworkflows/goodwright/clip_calc_crosslinks/main'
-include { CLIP_CALC_CROSSLINKS as CALC_TRANSCRIPT_CROSSLINKS    } from './subworkflows/goodwright/clip_calc_crosslinks/main'
 include { PARACLU_ANALYSE as PARACLU_ANALYSE_GENOME             } from './subworkflows/goodwright/paraclu_analyse/main'
 include { PARACLU_ANALYSE as PARACLU_ANALYSE_TRANSCRIPT         } from './subworkflows/goodwright/paraclu_analyse/main'
 include { ICOUNT_ANALYSE                                        } from './subworkflows/goodwright/icount_analyse/main'
@@ -456,8 +457,9 @@ workflow CLIPSEQ {
         * SUBWORKFLOW: Run crosslink calculation for smRNA with -k 1
         */
         CALC_SMRNA_K1_CROSSLINKS (
-            ch_smrna_k1_bam,
-            ch_smrna_fasta_fai.collect{ it[1] }
+            ch_smrna_k1_bam.join(ch_smrna_k1_bai),
+            ch_smrna_fasta_fai,
+            params.crosslink_position
         )
         ch_versions                      = ch_versions.mix(CALC_SMRNA_K1_CROSSLINKS.out.versions)
         ch_smrna_crosslink_bed           = CALC_SMRNA_K1_CROSSLINKS.out.bed
@@ -468,8 +470,9 @@ workflow CLIPSEQ {
         * SUBWORKFLOW: Run crosslink calculation for genome
         */
         CALC_GENOME_CROSSLINKS (
-            ch_genome_bam,
-            ch_fasta_fai.collect{ it[1] }
+            ch_genome_bam.join(ch_genome_bai),
+            ch_fasta_fai,
+            params.crosslink_position
         )
         ch_versions                       = ch_versions.mix(CALC_GENOME_CROSSLINKS.out.versions)
         ch_genome_crosslink_bed           = CALC_GENOME_CROSSLINKS.out.bed
@@ -480,8 +483,9 @@ workflow CLIPSEQ {
         * SUBWORKFLOW: Run crosslink calculation for transcripts
         */
         CALC_TRANSCRIPT_CROSSLINKS (
-            ch_transcript_bam,
-            ch_longest_transcript_fai.collect{ it[1] }
+            ch_transcript_bam.join(ch_transcript_bai),
+            ch_longest_transcript_fai,
+            params.crosslink_position
         )
         ch_versions                      = ch_versions.mix(CALC_TRANSCRIPT_CROSSLINKS.out.versions)
         ch_trans_crosslink_bed           = CALC_TRANSCRIPT_CROSSLINKS.out.bed
diff --git a/modules/local/get_crosslinks.nf b/modules/local/get_crosslinks.nf
new file mode 100644
index 0000000..02ce4ad
--- /dev/null
+++ b/modules/local/get_crosslinks.nf
@@ -0,0 +1,78 @@
+process GET_CROSSLINKS {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::bedtools=2.30.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bedtools:2.30.0--hc088bd4_0' :
+        'biocontainers/bedtools:2.30.0--hc088bd4_0' }"
+
+    input:
+    tuple val(meta), path(bam), path(bai)
+    tuple val(meta2), path(fai)
+    val crosslink_position
+
+    output:
+    tuple val(meta), path("*.xl.bed")               , emit: bed 
+    tuple val(meta), path("*.xl.bedgraph")          , emit: coverage 
+    tuple val(meta), path("*.xl.CPMnorm.bedgraph")  , emit: coverage_norm
+    path  "versions.yml"                            ,emit: versions
+
+
+    script:
+    def prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}"
+    if (crosslink_position == "start"){
+        """
+        bedtools bamtobed -i $bam > dedup.bed
+        bedtools shift -m 1 -p -1 -i dedup.bed -g $fai > shifted.bed
+        bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed
+        bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed
+        cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed
+        cat ${prefix}.xl.bed | awk '{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph
+        TOTAL_VARIABLE=`cat ${prefix}.xl.bed | awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'`
+        cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            linux: NOVERSION
+            bedtools: `bedtools --version | head -n 1`
+        END_VERSIONS
+        """
+    } else if (crosslink_position == "middle"){
+        """
+        bedtools bamtobed -i $bam > dedup.bed
+        awk '{OFS="\t"}{mid=int((\$2+\$3)/2); print \$1, mid, mid+1, \$4, \$5, \$6}' dedup.bed > shifted.bed
+        bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed
+        bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed
+        cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed
+        cat ${prefix}.xl.bed | awk '{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph
+        TOTAL_VARIABLE=`cat ${prefix}.xl.bed | awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'`
+        cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            linux: NOVERSION
+            bedtools: `bedtools --version | head -n 1`
+        END_VERSIONS
+        """
+    } else if (crosslink_position == "end"){
+        """
+        bedtools bamtobed -i $bam > dedup.bed
+        awk -v OFS="\t" '\$6=="+" {print \$1,\$3,\$3+1,\$4,\$5,\$6} \$6=="-" {print \$1,\$2-1,\$2,\$4,\$5,\$6}' dedup.bed > shifted.bed
+        bedtools genomecov -dz -strand + -3 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed
+        bedtools genomecov -dz -strand - -3 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed
+        cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed
+        cat ${prefix}.xl.bed | awk '{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph
+        TOTAL_VARIABLE=`cat ${prefix}.xl.bed | awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'`
+        cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            linux: NOVERSION
+            bedtools: `bedtools --version | head -n 1`
+        END_VERSIONS
+        """
+    } else {
+        error "Invalid crosslink_position: ${crosslink_position}"
+    }
+}
diff --git a/nextflow.config b/nextflow.config
index 8c471a2..e857fac 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -69,6 +69,7 @@ params {
     save_unaligned_res  = true
 
     // Pipeline params
+    crosslink_position  = "start"
     encode_eclip        = false
     move_umi_to_header  = false
     umi_header_format   = null
diff --git a/schema/clipseq.json b/schema/clipseq.json
index 0fe828e..4e0aff5 100644
--- a/schema/clipseq.json
+++ b/schema/clipseq.json
@@ -287,6 +287,12 @@
             "description": "Additional pipeline configuration options.",
             "advanced": true,
             "params": {
+                "crosslink_position": {
+                    "name": "Crosslink position",
+                    "description": "The position of the crosslink in the read. Options are 'start', 'end' or 'middle'. Note that 'start' and 'end' correspond to start and end of read minus and plus 1 respectively.",
+                    "type": "string",
+                    "default": "start"
+                },
                 "move_umi_to_header": {
                     "name": "Extract UMI to header",
                     "description": "Runs UMI to header extraction based on the head format provided in UMI header format.",
@@ -350,15 +356,15 @@
     ],
     "outputs": [
         {
-            "name": "Normalised genome crosslink bedgraph",
-            "description": "Genomic crosslinks normalised to total sample crosslinks in bedgraph format for genome browser viewing.",
-            "filetype": "bedgraph",
-            "process": "CROSSLINK_NORMCOVERAGE"
+            "name": "Raw genome crosslink bedgraph",
+            "description": "Genomic crosslinks in bedgraph format for genome browser viewing.",
+            "filetype": ".xl.bedgraph",
+            "process": "CALC_GENOME_CROSSLINKS"
         },
         {
             "name": "Genomic peaks",
             "description": "Genomic peaks generated by Clippy.",
-            "filetype": "bed",
+            "filetype": "Peaks.bed",
             "process": "CLIPPY_GENOME"
         },
         {
diff --git a/subworkflows/goodwright/clip_calc_crosslinks/default.config b/subworkflows/goodwright/clip_calc_crosslinks/default.config
deleted file mode 100644
index 4b0cc0a..0000000
--- a/subworkflows/goodwright/clip_calc_crosslinks/default.config
+++ /dev/null
@@ -1,49 +0,0 @@
-process {
-
-    withName: BEDTOOLS_SHIFT {
-        ext.args = '-m 1 -p -1'
-    }
-
-    withName: BEDTOOLS_GENOMECOV_POS {
-        ext.args = '-dz -strand + -5'
-    }
-
-    withName: BEDTOOLS_GENOMECOV_NEG {
-        ext.args = '-dz -strand - -5'
-    }
-
-    withName: SELECT_BED_POS {
-        ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}\''
-        ext.suffix = '.pos'
-        ext.ext = 'bed'
-    }
-
-    withName: SELECT_BED_NEG {
-        ext.cmd1 = 'awk \'{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}\''
-        ext.suffix = '.neg'
-        ext.ext = 'bed'
-    }
-
-    withName: MERGE_AND_SORT {
-        ext.cmd1 = 'sort -k1,1 -k2,2n'
-        ext.suffix = '.sorted'
-        ext.ext = 'bed'
-    }
-
-    withName: CROSSLINK_COVERAGE {
-        ext.cmd1 = 'awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n'
-        ext.suffix = '.sorted'
-        ext.ext = 'bedgraph'
-    }
-
-    withName: CROSSLINK_NORMCOVERAGE {
-        ext.cmd1 = """
-                   awk -v total=\$CMD2 \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | 
-                   awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | 
-                   sort -k1,1 -k2,2n
-                   """
-        ext.cmd2 = 'awk \'BEGIN {total=0} {total=total+\$5} END {print total}\''
-        ext.suffix = '.norm.sorted'
-        ext.ext = 'bedgraph'
-    }
-}
diff --git a/subworkflows/goodwright/clip_calc_crosslinks/main.nf b/subworkflows/goodwright/clip_calc_crosslinks/main.nf
deleted file mode 100644
index b53c949..0000000
--- a/subworkflows/goodwright/clip_calc_crosslinks/main.nf
+++ /dev/null
@@ -1,120 +0,0 @@
-//
-// Calculate clip crosslinks using an input BAM file and genome index file.
-// Crosslinks are outputed as a BED file and additional coverage and normalised coverage
-// tracks are calculated in BEDGRAPH format
-//
-
-/*
-* MODULES
-*/
-include { BEDTOOLS_BAMTOBED                            } from '../../../modules/nf-core/bedtools/bamtobed/main.nf'
-include { BEDTOOLS_SHIFT                               } from '../../../modules/goodwright/bedtools/shift/main.nf'
-include { BEDTOOLS_GENOMECOV as BEDTOOLS_GENOMECOV_POS } from '../../../modules/nf-core/bedtools/genomecov/main.nf'
-include { BEDTOOLS_GENOMECOV as BEDTOOLS_GENOMECOV_NEG } from '../../../modules/nf-core/bedtools/genomecov/main.nf'
-include { LINUX_COMMAND as SELECT_BED_POS              } from '../../../modules/goodwright/linux/command/main.nf'
-include { LINUX_COMMAND as SELECT_BED_NEG              } from '../../../modules/goodwright/linux/command/main.nf'
-include { LINUX_COMMAND as MERGE_AND_SORT              } from '../../../modules/goodwright/linux/command/main.nf'
-include { LINUX_COMMAND as CROSSLINK_COVERAGE          } from '../../../modules/goodwright/linux/command/main.nf'
-include { LINUX_COMMAND as CROSSLINK_NORMCOVERAGE      } from '../../../modules/goodwright/linux/command/main.nf'
-
-workflow CLIP_CALC_CROSSLINKS {
-    take:
-    bam // channel: [ val(meta), [ bam ] ]
-    fai // channel: [ fai ]
-
-    main:
-    ch_versions = Channel.empty()
-
-    /*
-    * MODULE: Convert input BAM file to BED format
-    */
-    BEDTOOLS_BAMTOBED (
-        bam
-    )
-    ch_versions = ch_versions.mix(BEDTOOLS_BAMTOBED.out.versions)
-
-    /*
-    * MODULE: Shift BED file according to parameters suppied in config (default is -s 0)
-    */
-    BEDTOOLS_SHIFT (
-        BEDTOOLS_BAMTOBED.out.bed,
-        fai
-    )
-    ch_versions = ch_versions.mix(BEDTOOLS_SHIFT.out.versions)
-
-    /*
-    * MODULE: Report depth at each position on the pos strand
-    */
-    BEDTOOLS_GENOMECOV_POS (
-        BEDTOOLS_SHIFT.out.bed.map{ [ it[0], it[1], 1 ] },
-        fai,
-        'pos.bed'
-    )
-    ch_versions = ch_versions.mix(BEDTOOLS_GENOMECOV_POS.out.versions)
-
-    /*
-    * MODULE: Report depth at each position on the neg strand
-    */
-    BEDTOOLS_GENOMECOV_NEG (
-        BEDTOOLS_SHIFT.out.bed.map{ [ it[0], it[1], 1 ] },
-        fai,
-        'neg.bed'
-    )
-
-    /*
-    * MODULE: Select columns in BED file using AWK
-    */
-    SELECT_BED_POS (
-        BEDTOOLS_GENOMECOV_POS.out.genomecov,
-        [],
-        false
-    )
-    SELECT_BED_NEG (
-        BEDTOOLS_GENOMECOV_NEG.out.genomecov,
-        [],
-        false
-    )
-
-    /*
-    * CHANNEL: Join POS/NEG files into one channel so they can be merged in the next module
-    */
-    ch_merge_and_sort_input = SELECT_BED_POS.out.file
-        .map{ [ it[0].id, it[0], it[1] ] }
-        .join( SELECT_BED_NEG.out.file.map{ [ it[0].id, it[0], it[1] ] } )
-        .map { [ it[1], [ it[2], it[4] ] ] }
-    //EXAMPLE CHANNEL STRUCT: [ [id:test], [ BED(pos), BED(neg) ] ]
-    //ch_merge_and_sort_input | view 
-
-    /*
-    * MODULE: Select columns in BED file using AWK
-    */
-    MERGE_AND_SORT (
-        ch_merge_and_sort_input,
-        [],
-        false
-    )
-
-    /*
-    * MODULE: Create coverage track using AWK
-    */
-    CROSSLINK_COVERAGE (
-        MERGE_AND_SORT.out.file,
-        [],
-        false
-    )
-
-    /*
-    * MODULE: Create normalised coverage track using AWK
-    */
-    CROSSLINK_NORMCOVERAGE (
-        MERGE_AND_SORT.out.file,
-        [],
-        true
-    )
-
-    emit:
-    bed           = MERGE_AND_SORT.out.file         // channel: [ val(meta), [ bed ] ]
-    coverage      = CROSSLINK_COVERAGE.out.file     // channel: [ val(meta), [ bedgraph ] ]
-    coverage_norm = CROSSLINK_NORMCOVERAGE.out.file // channel: [ val(meta), [ bedgraph ] ]
-    versions      = ch_versions                     // channel: [ versions.yml ]
-}
diff --git a/subworkflows/goodwright/clip_calc_crosslinks/meta.yml b/subworkflows/goodwright/clip_calc_crosslinks/meta.yml
deleted file mode 100644
index 0d6ac85..0000000
--- a/subworkflows/goodwright/clip_calc_crosslinks/meta.yml
+++ /dev/null
@@ -1,53 +0,0 @@
-name: clip_calc_crosslinks
-description: |
-  Calculate clip crosslinks using an input BAM file and genome index file. Crosslinks are outputed as a BED file and additional 
-  coverage and normalised coverage tracks are calculated in BEDGRAPH format.
-keywords:
-  - bedtools
-  - crosslinks
-  - coverage
-  - fai
-  - bam
-modules:
-  - goodwright/linux/command
-  - goodwright/bedtools/shift
-  - nf-core/bedtools/bamtobed
-  - nf-core/bedtools/genomecov
-input:
-  - meta:
-      type: map
-      description: |
-        Groovy Map containing sample information
-        e.g. [ id:'test', single_end:false ]
-  - bam:
-      type: file
-      description: BAM file
-      pattern: "*.bam"
-  - fai:
-      type: file
-      description: FAI file
-      pattern: "*.fai"
-output:
-  - meta:
-      type: map
-      description: |
-        Groovy Map containing sample information
-        e.g. [ id:'test', single_end:false ]
-  - bed:
-      type: file
-      description: Bedfile containing the crosslink depths at genomic positions
-      pattern: "*.bed"
-  - coverage:
-      type: file
-      description: The crosslinks bed file turned into a bedgraph coverage file.
-      pattern: "*.bedgraph"
-  - coverage_norm:
-      type: file
-      description: The crosslinks bed file turned into a bedgraph coverage file normalised against total count
-      pattern: "*.bedgraph"
-  - versions:
-      type: file
-      description: File containing software versions
-      pattern: "versions.yml"
-authors:
-  - "@chris-cheshire"

From 6cd0860f05553b673faf9ae6a004d81feaa73b7a Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Tue, 7 Jan 2025 21:37:25 +0000
Subject: [PATCH 06/10] fix edge case crosslinks going over edge of chr

---
 modules/local/get_crosslinks.nf | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/modules/local/get_crosslinks.nf b/modules/local/get_crosslinks.nf
index 02ce4ad..6119720 100644
--- a/modules/local/get_crosslinks.nf
+++ b/modules/local/get_crosslinks.nf
@@ -24,7 +24,8 @@ process GET_CROSSLINKS {
     if (crosslink_position == "start"){
         """
         bedtools bamtobed -i $bam > dedup.bed
-        bedtools shift -m 1 -p -1 -i dedup.bed -g $fai > shifted.bed
+        bedtools shift -m 1 -p -1 -i dedup.bed -g $fai > shiftedtemp.bed
+        awk -v OFS="\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed
         bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed
         bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed
         cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed
@@ -41,7 +42,8 @@ process GET_CROSSLINKS {
     } else if (crosslink_position == "middle"){
         """
         bedtools bamtobed -i $bam > dedup.bed
-        awk '{OFS="\t"}{mid=int((\$2+\$3)/2); print \$1, mid, mid+1, \$4, \$5, \$6}' dedup.bed > shifted.bed
+        awk '{OFS="\t"}{mid=int((\$2+\$3)/2); print \$1, mid, mid+1, \$4, \$5, \$6}' dedup.bed > shiftedtemp.bed
+        awk -v OFS="\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed
         bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed
         bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed
         cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed
@@ -58,7 +60,8 @@ process GET_CROSSLINKS {
     } else if (crosslink_position == "end"){
         """
         bedtools bamtobed -i $bam > dedup.bed
-        awk -v OFS="\t" '\$6=="+" {print \$1,\$3,\$3+1,\$4,\$5,\$6} \$6=="-" {print \$1,\$2-1,\$2,\$4,\$5,\$6}' dedup.bed > shifted.bed
+        awk -v OFS="\t" '\$6=="+" {print \$1,\$3,\$3+1,\$4,\$5,\$6} \$6=="-" {print \$1,\$2-1,\$2,\$4,\$5,\$6}' dedup.bed > shiftedtemp.bed
+        awk -v OFS="\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed
         bedtools genomecov -dz -strand + -3 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed
         bedtools genomecov -dz -strand - -3 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed
         cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed

From 722f20050746170744ae5481dbb23ae6fe6cdab5 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Wed, 8 Jan 2025 13:22:46 +0000
Subject: [PATCH 07/10] fix middle crosslink position for spliced reads

---
 modules/local/get_crosslinks.nf | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/modules/local/get_crosslinks.nf b/modules/local/get_crosslinks.nf
index 6119720..48b9ed6 100644
--- a/modules/local/get_crosslinks.nf
+++ b/modules/local/get_crosslinks.nf
@@ -41,16 +41,16 @@ process GET_CROSSLINKS {
         """
     } else if (crosslink_position == "middle"){
         """
-        bedtools bamtobed -i $bam > dedup.bed
-        awk '{OFS="\t"}{mid=int((\$2+\$3)/2); print \$1, mid, mid+1, \$4, \$5, \$6}' dedup.bed > shiftedtemp.bed
-        awk -v OFS="\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed
-        bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed
-        bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed
+        bedtools bamtobed -bed12 -i $bam > dedup.bed12
+        awk -F'\\t' -v OFS='\\t' 'function abs(x){return x<0?-x:x}{split(\$11,sizes,",");split(\$12,starts,",");total_covered=0;for(i=1;i<=\$10;i++){total_covered+=sizes[i]};target_pos=int(total_covered/2);covered_so_far=0;for(i=1;i<=\$10;i++){if(covered_so_far+sizes[i]>target_pos){offset=target_pos-covered_so_far;mid_pos=\$2+starts[i]+offset;print \$1,mid_pos,mid_pos+1,\$4,\$5,\$6;break};covered_so_far+=sizes[i]}}' dedup.bed12 > shiftedtemp.bed
+        awk -v OFS="\\t" 'BEGIN {while (getline < ARGV[2]) {chrom[\$1] = \$2} ARGV[2] = ""} {start=\$2; end=\$3; if(start<0){start=0; end=1} if(end>chrom[\$1]){start=chrom[\$1]-1; end=chrom[\$1]} print \$1,start,end,\$4,\$5,\$6}' shiftedtemp.bed $fai > shifted.bed
+        bedtools genomecov -dz -strand + -5 -i shifted.bed -g $fai | awk '{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "+"}' > pos.bed
+        bedtools genomecov -dz -strand - -5 -i shifted.bed -g $fai | awk '{OFS="\\t"}{print \$1, \$2, \$2+1, ".", \$3, "-"}' > neg.bed
         cat pos.bed neg.bed | sort -k1,1 -k2,2n -k3,3 -k6,6 > ${prefix}.xl.bed
-        cat ${prefix}.xl.bed | awk '{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph
+        cat ${prefix}.xl.bed | awk '{OFS = "\\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}' > ${prefix}.xl.bedgraph
         TOTAL_VARIABLE=`cat ${prefix}.xl.bed | awk \'BEGIN {total=0} {total=total+\$5} END {print total}\'`
-        cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph
-
+        cat ${prefix}.xl.bed | awk -v total=\$TOTAL_VARIABLE \'{printf "%s\\t%i\\t%i\\t%s\\t%f\\t%s\\n", \$1, \$2, \$3, \$4, 1000000*\$5/total, \$6}\' | awk \'{OFS = "\\t"}{if (\$6 == "+") {print \$1, \$2, \$3, \$5} else {print \$1, \$2, \$3, -\$5}}\' | sort -k1,1 -k2,2n > ${prefix}.xl.CPMnorm.bedgraph
+        
         cat <<-END_VERSIONS > versions.yml
         "${task.process}":
             linux: NOVERSION

From 80b9775517d529f7e0a55d04201f85c8fa373201 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Fri, 10 Jan 2025 13:11:51 +0000
Subject: [PATCH 08/10] correct all samples not running when fai is provided

---
 conf/test.config                | 2 +-
 main.nf                         | 6 +++---
 modules/local/get_crosslinks.nf | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index 60fd638..4d3397c 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -21,7 +21,7 @@ params {
     max_time   = '6.h'
 
     // Input data
-    samplesheet = './tests/data/samplesheets/small-single-sample-se.csv'
+    samplesheet = './tests/data/samplesheets/small-dual-sample-se.csv'
     fasta       = './tests/data/genome/yeast_MitoV.fa.gz'
     smrna_fasta = './tests/data/genome/homosapiens_smallRNA.fa.gz'
     gtf         = './tests/data/genome/yeast_MitoV.gtf.gz'
diff --git a/main.nf b/main.nf
index bd0d336..7bb5e8e 100644
--- a/main.nf
+++ b/main.nf
@@ -458,7 +458,7 @@ workflow CLIPSEQ {
         */
         CALC_SMRNA_K1_CROSSLINKS (
             ch_smrna_k1_bam.join(ch_smrna_k1_bai),
-            ch_smrna_fasta_fai,
+            ch_smrna_fasta_fai.collect{ it[1] },
             params.crosslink_position
         )
         ch_versions                      = ch_versions.mix(CALC_SMRNA_K1_CROSSLINKS.out.versions)
@@ -471,7 +471,7 @@ workflow CLIPSEQ {
         */
         CALC_GENOME_CROSSLINKS (
             ch_genome_bam.join(ch_genome_bai),
-            ch_fasta_fai,
+            ch_fasta_fai.collect{ it[1] },
             params.crosslink_position
         )
         ch_versions                       = ch_versions.mix(CALC_GENOME_CROSSLINKS.out.versions)
@@ -484,7 +484,7 @@ workflow CLIPSEQ {
         */
         CALC_TRANSCRIPT_CROSSLINKS (
             ch_transcript_bam.join(ch_transcript_bai),
-            ch_longest_transcript_fai,
+            ch_longest_transcript_fai.collect{ it[1] },
             params.crosslink_position
         )
         ch_versions                      = ch_versions.mix(CALC_TRANSCRIPT_CROSSLINKS.out.versions)
diff --git a/modules/local/get_crosslinks.nf b/modules/local/get_crosslinks.nf
index 48b9ed6..74a05d1 100644
--- a/modules/local/get_crosslinks.nf
+++ b/modules/local/get_crosslinks.nf
@@ -9,7 +9,7 @@ process GET_CROSSLINKS {
 
     input:
     tuple val(meta), path(bam), path(bai)
-    tuple val(meta2), path(fai)
+    path fai
     val crosslink_position
 
     output:

From 9d44d8789bbe90039605b5c439f88d1c93f86a68 Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <CharlotteAnne@users.noreply.github.com>
Date: Tue, 14 Jan 2025 14:06:40 +0000
Subject: [PATCH 09/10] Update clipseq.json

---
 schema/clipseq.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/schema/clipseq.json b/schema/clipseq.json
index 4e0aff5..1d1917f 100644
--- a/schema/clipseq.json
+++ b/schema/clipseq.json
@@ -371,7 +371,7 @@
             "name": "Crosslink summary",
             "description": "Crosslinks summarised by gene, type (eg. CDS, intron) and subtype (eg. lncRNA, mRNA).",
             "filetype": "tsv",
-            "process": "ICOUNT_SUMMARY"
+            "process": "MERGE_SUMMARY"
         },
         {
             "name": "K-mer enrichment",

From e60e1a4e44850c96f73f64d5c93fe2da805d1b6e Mon Sep 17 00:00:00 2001
From: Charlotte Capitanchik <charlotteannecap@gmail.com>
Date: Mon, 20 Jan 2025 18:10:38 +0000
Subject: [PATCH 10/10] expose trim galore params

---
 schema/clipseq.json | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/schema/clipseq.json b/schema/clipseq.json
index 4e0aff5..086c439 100644
--- a/schema/clipseq.json
+++ b/schema/clipseq.json
@@ -350,6 +350,11 @@
                     "name": "PEKA parameters",
                     "description": "Parameters for PEKA K-mer enrichment analysis",
                     "type": "string"
+                },
+                "trimgalore_params": {
+                    "name": "Trim Galore! parameters",
+                    "description": "Parameters for Trim Galore! We recommend including the default params along with any new ones you add: '--fastqc --length 10 -q 20'",
+                    "type": "string"
                 }
             }
         }