diff --git a/Makefile b/Makefile
index bcab4bf..4816617 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,9 @@ INCL_DIR := $(CURDIR)/include
SRC_DIR := $(CURDIR)/src
LIB_DIR := $(CURDIR)/lib
+VERSION := $(shell git describe --tags --always)
+VERSION_HEADER := $(INCL_DIR)/version.h
+
# Set the library paths for the compiler
CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX)
LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib
@@ -10,6 +13,11 @@ INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include
# All targets
all: swig_build compile
+# Rule to generate version.h
+$(VERSION_HEADER):
+ @echo "#pragma once" > $@
+ @echo "#define VERSION \"$(VERSION)\"" >> $@
+
# Generate the SWIG Python/C++ wrappers
swig_build:
mkdir -p $(LIB_DIR)
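The new rule shells out to `git describe --tags --always` at build time and writes a two-line header. A minimal Python sketch of the same generation step, for illustration only (the `unknown` fallback and the default path are assumptions, not part of the patch):

```python
# Reproduce what the Makefile rule emits: a two-line version header derived
# from `git describe --tags --always`.
import subprocess
from pathlib import Path

def write_version_header(header_path="include/version.h"):
    try:
        version = subprocess.check_output(
            ["git", "describe", "--tags", "--always"], text=True
        ).strip()
    except (subprocess.CalledProcessError, FileNotFoundError):
        version = "unknown"  # assumed fallback outside a git checkout
    Path(header_path).write_text(f'#pragma once\n#define VERSION "{version}"\n')

if __name__ == "__main__":
    write_version_header()
```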
diff --git a/README.md b/README.md
index 47307b0..d820ff9 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ LongReadSum supports FASTA, FASTQ, BAM, FAST5, and sequencing_summary.txt file f
- [Installation using Anaconda (recommended)](#installation-using-anaconda)
- [Installation using Docker](#installation-using-anaconda)
- [Building from source](#building-from-source)
+- [MultiQC support](#multiqc-support)
- General usage for common filetypes:
- [Common parameters](#common-parameters)
- [WGS BAM](#wgs-bam)
@@ -78,6 +79,19 @@ conda activate longreadsum
make
```
+# MultiQC support
+[MultiQC](https://seqera.io/multiqc/) is a widely used open-source tool for
+aggregating bioinformatics analysis results from many tools across samples.
+
+To run MultiQC, pass in the LongReadSum output directory containing the JSON
+summary file and specify the _longreadsum_ module:
+
+```
+multiqc $INPUT_DIRECTORY --module longreadsum --outdir $OUTPUT_DIRECTORY/multiqc
+```
+
+Example report:
+
## Running
Activate the conda environment and then run with arguments:
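Before pointing MultiQC at the output directory, the emitted summaries can be sanity-checked from Python. A small sketch, assuming the `<sample>_summary*.json` naming introduced in this patch and the `output/` directories used by the tests:

```python
# Verify each LongReadSum JSON summary parses and carries the expected keys.
import glob
import json

for path in glob.glob("output/**/*_summary*.json", recursive=True):
    with open(path) as fh:
        summary = json.load(fh)  # raises json.JSONDecodeError if malformed
    print(path, summary["filetype"], summary["longreadsum_version"])
```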
diff --git a/include/input_parameters.h b/include/input_parameters.h
index e092f5f..cdc9f44 100644
--- a/include/input_parameters.h
+++ b/include/input_parameters.h
@@ -21,7 +21,6 @@ class Input_Para{
// Parameters
int threads;
size_t num_input_files;
- std::string out_prefix;
int64_t other_flags;
int32_t user_defined_fastq_base_qual_offset;
std::string output_folder; // Output folder
@@ -36,10 +35,14 @@ class Input_Para{
bool mod_analysis; // Perform base modification analysis on BAM file
int tin_sample_size; // Number of equally spaced samples for TIN calculation
int tin_min_coverage; // Minimum coverage for TIN calculation
+ std::string sample_name; // Sample name
+ std::string version_str = ""; // Version string for the program
// Functions
std::string add_input_file(const std::string& input_filepath);
+ const std::string &getVersion() const;
+
Input_Para();
~Input_Para();
diff --git a/include/tin.h b/include/tin.h
index fd4c330..27e406c 100644
--- a/include/tin.h
+++ b/include/tin.h
@@ -15,7 +15,7 @@
std::unordered_map<int, int> getReadDepths(htsFile* bam_file, hts_idx_t* idx, bam_hdr_t* header, std::string chr, int start, int end, bool transcript_strand);
-void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder, int thread_count);
+void calculateTIN(TINStats& tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& sample_name, const std::string& output_folder, int thread_count);
diff --git a/include/version.h b/include/version.h
new file mode 100644
index 0000000..767e900
--- /dev/null
+++ b/include/version.h
@@ -0,0 +1,2 @@
+#pragma once
+#define VERSION "v1.4.0-5-g9e0d2c4"
diff --git a/src/bam_module.cpp b/src/bam_module.cpp
index e73c9fb..3780a8a 100644
--- a/src/bam_module.cpp
+++ b/src/bam_module.cpp
@@ -80,7 +80,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
std::cout << "Calculating TIN scores for file: " << filepath << std::endl;
TINStats tin_stats;
- calculateTIN(&tin_stats, gene_bed, input_params.input_files[i], min_cov, sample_size, input_params.output_folder, input_params.threads);
+ calculateTIN(tin_stats, gene_bed, input_params.input_files[i], min_cov, sample_size, input_params.sample_name, input_params.output_folder, input_params.threads);
// Print the TIN stats
std::cout << "Number of transcripts: " << tin_stats.num_transcripts << std::endl;
@@ -154,8 +154,6 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
while (reader.hasNextRecord()){
// Read the next batch of records
- // std::cout << "Generating " << thread_count << " thread(s)..." <<
- // std::endl;
printMessage("Generating " + std::to_string(thread_count) + " thread(s)...");
std::vector<std::thread> thread_vector;
for (int thread_index=0; thread_index<thread_count; thread_index++){
diff --git a/src/fast5_module.cpp b/src/fast5_module.cpp
--- a/src/fast5_module.cpp
+++ b/src/fast5_module.cpp
@@ ... @@
- std::vector<int> read_lengths = output_data.long_read_info.read_lengths;
- std::sort(read_lengths.begin(), read_lengths.end(), std::greater<int>());
-
- // Get the max read length
- int64_t max_read_length = read_lengths.at(0);
- output_data.long_read_info.longest_read_length = max_read_length;
-
- // Get the median read length
- int64_t median_read_length = read_lengths[read_lengths.size() / 2];
- output_data.long_read_info.median_read_length = median_read_length;
-
- // Get the mean read length
- float mean_read_length = (double)total_num_bases / (double)read_lengths.size();
- output_data.long_read_info.mean_read_length = mean_read_length;
-
- // Calculate N50 and other N-scores
- for (int percent_value = 1; percent_value <= 100; percent_value++)
- {
- // Get the base percentage threshold for this N-score
- double base_threshold = (double)total_num_bases * (percent_value / 100.0);
-
- // Calculate the NXX score
- double current_base_count = 0;
- int current_read_index = -1;
- while (current_base_count < base_threshold) {
- current_read_index ++;
- current_base_count += read_lengths.at(current_read_index);
- }
- int nxx_read_length = read_lengths.at(current_read_index);
- output_data.long_read_info.NXX_read_length[percent_value] = nxx_read_length;
- }
+ // Loop through each input file and get the QC data across files
+ size_t file_count = _input_data.num_input_files;
- // Set common score variables
- output_data.long_read_info.n50_read_length = output_data.long_read_info.NXX_read_length[50];
- output_data.long_read_info.n95_read_length = output_data.long_read_info.NXX_read_length[95];
- output_data.long_read_info.n05_read_length = output_data.long_read_info.NXX_read_length[5];
-
- // Create the summary file
- std::cout << "Writing summary file: " << read_summary_file.c_str() << std::endl;
- read_summary_fp = fopen(read_summary_file.c_str(), "w");
- fprintf(read_summary_fp, "total number of reads\t%d\n", output_data.long_read_info.total_num_reads);
- fprintf(read_summary_fp, "total number of bases\t%ld\n", output_data.long_read_info.total_num_bases);
- fprintf(read_summary_fp, "longest read length\t%d\n", output_data.long_read_info.longest_read_length);
- fprintf(read_summary_fp, "N50 read length\t%d\n", output_data.long_read_info.n50_read_length);
- fprintf(read_summary_fp, "mean read length\t%.2f\n", output_data.long_read_info.mean_read_length);
- fprintf(read_summary_fp, "median read length\t%d\n", output_data.long_read_info.median_read_length);
- fprintf(read_summary_fp, "GC%%\t%.2f\n", output_data.long_read_info.gc_cnt * 100);
- fprintf(read_summary_fp, "\n\n");
- for (int percent = 5; percent < 100; percent += 5)
- {
- fprintf(read_summary_fp, "N%02d read length\t%.d\n", percent, output_data.long_read_info.NXX_read_length[percent]);
+ // Write QC details to the file
+ for (size_t i = 0; i < file_count; i++)
+ {
+ const std::string input_file = _input_data.input_files[i];
+ exit_code = writeBaseQCDetails(input_file, output_data, read_details_fp);
+ }
+ fclose(read_details_fp);
+
+ // Check if the GC content was calculated successfully
+ if (exit_code == 0) {
+
+ // Add the G + C bases
+ double g_c = output_data.long_read_info.total_g_cnt + output_data.long_read_info.total_c_cnt;
+
+ // Add all bases
+ double a_tu_g_c = g_c + output_data.long_read_info.total_a_cnt + output_data.long_read_info.total_tu_cnt;
+
+ // Calculate read length statistics if base counts are not zero
+ uint64_t total_num_bases = output_data.long_read_info.total_num_bases;
+ if (total_num_bases == 0) {
+ std::cerr << "No bases found in input files." << std::endl;
+ exit_code = 3;
+ } else {
+ // Calculate GC-content
+ output_data.long_read_info.gc_cnt = g_c / a_tu_g_c;
+
+ // Sort the read lengths in descending order
+ std::vector<int> read_lengths = output_data.long_read_info.read_lengths;
+ std::sort(read_lengths.begin(), read_lengths.end(), std::greater<int>());
+
+ // Get the max read length
+ int64_t max_read_length = read_lengths.at(0);
+ output_data.long_read_info.longest_read_length = max_read_length;
+
+ // Get the median read length
+ int64_t median_read_length = read_lengths[read_lengths.size() / 2];
+ output_data.long_read_info.median_read_length = median_read_length;
+
+ // Get the mean read length
+ float mean_read_length = (double)total_num_bases / (double)read_lengths.size();
+ output_data.long_read_info.mean_read_length = mean_read_length;
+
+ // Calculate N50 and other N-scores
+ for (int percent_value = 1; percent_value <= 100; percent_value++)
+ {
+ // Get the base percentage threshold for this N-score
+ double base_threshold = (double)total_num_bases * (percent_value / 100.0);
+
+ // Calculate the NXX score
+ double current_base_count = 0;
+ int current_read_index = -1;
+ while (current_base_count < base_threshold) {
+ current_read_index ++;
+ current_base_count += read_lengths.at(current_read_index);
+ }
+ int nxx_read_length = read_lengths.at(current_read_index);
+ output_data.long_read_info.NXX_read_length[percent_value] = nxx_read_length;
+ }
- fprintf(read_summary_fp, "\n\n");
-
- fprintf(read_summary_fp, "GC content\tnumber of reads\n");
- for (int gc_ratio = 0; gc_ratio < 100; gc_ratio++)
- {
- fprintf(read_summary_fp, "GC=%d%%\t%d\n", gc_ratio, output_data.long_read_info.read_gc_content_count[gc_ratio]);
- }
+ // Set common score variables
+ output_data.long_read_info.n50_read_length = output_data.long_read_info.NXX_read_length[50];
+ output_data.long_read_info.n95_read_length = output_data.long_read_info.NXX_read_length[95];
+ output_data.long_read_info.n05_read_length = output_data.long_read_info.NXX_read_length[5];
+
+ // Create the summary file
+ std::cout << "Writing summary file: " << read_summary_file.c_str() << std::endl;
+ read_summary_fp = fopen(read_summary_file.c_str(), "w");
+ // Write summary in JSON format
+ fprintf(read_summary_fp, "{\n");
+ fprintf(read_summary_fp, " \"filetype\": \"fast5\",\n");
+ fprintf(read_summary_fp, " \"longreadsum_version\": \"%s\",\n", _input_data.getVersion().c_str());
+ fprintf(read_summary_fp, " \"total_num_reads\": %d,\n", output_data.long_read_info.total_num_reads);
+ fprintf(read_summary_fp, " \"total_num_bases\": %ld,\n", output_data.long_read_info.total_num_bases);
+ fprintf(read_summary_fp, " \"longest_read_length\": %d,\n", output_data.long_read_info.longest_read_length);
+ fprintf(read_summary_fp, " \"n50_read_length\": %d,\n", output_data.long_read_info.n50_read_length);
+ fprintf(read_summary_fp, " \"mean_read_length\": %.2f,\n", output_data.long_read_info.mean_read_length);
+ fprintf(read_summary_fp, " \"median_read_length\": %d,\n", output_data.long_read_info.median_read_length);
+ fprintf(read_summary_fp, " \"gc_percent\": %.2f,\n", output_data.long_read_info.gc_cnt * 100);
+
+ // NXX read lengths
+ fprintf(read_summary_fp, " \"NXX_read_length\": {\n");
+ for (int percent = 5; percent < 100; percent += 5)
+ {
+ fprintf(read_summary_fp, " \"N%02d\": %d%s\n", percent, output_data.long_read_info.NXX_read_length[percent], (percent + 5 < 100) ? "," : "");
+ }
+ fprintf(read_summary_fp, " },\n");
+ // GC content distribution
+ fprintf(read_summary_fp, " \"gc_content_distribution\": {\n");
+ for (int gc_ratio = 0; gc_ratio < 100; gc_ratio++)
+ {
+ fprintf(read_summary_fp, " \"%d\": %d%s\n", gc_ratio, output_data.long_read_info.read_gc_content_count[gc_ratio], (gc_ratio + 1 < 100) ? "," : "");
+ }
+ fprintf(read_summary_fp, " },\n");
- fprintf(read_summary_fp, "\n\n");
- fprintf(read_summary_fp, "base quality\tnumber of bases\n");
- for (int baseq = 0; baseq <= 60; baseq++)
- {
- fprintf(read_summary_fp, "%d\t%ld\n", baseq, output_data.seq_quality_info.base_quality_distribution[baseq]);
- }
+ // Base quality distribution
+ fprintf(read_summary_fp, " \"base_quality_distribution\": {\n");
+ for (int baseq = 0; baseq <= 60; baseq++)
+ {
+ fprintf(read_summary_fp, " \"%d\": %ld%s\n", baseq, output_data.seq_quality_info.base_quality_distribution[baseq], (baseq < 60) ? "," : "");
+ }
+ fprintf(read_summary_fp, " },\n");
- fprintf(read_summary_fp, "\n\n");
- fprintf(read_summary_fp, "read average base quality\tnumber of reads\n");
- for (int baseq = 0; baseq <= 60; baseq++)
- {
- fprintf(read_summary_fp, "%d\t%d\n", baseq, output_data.seq_quality_info.read_average_base_quality_distribution[baseq]);
- }
- fclose(read_summary_fp);
+ // Read average base quality distribution
+ fprintf(read_summary_fp, " \"read_average_base_quality_distribution\": {\n");
+ for (int baseq = 0; baseq <= 60; baseq++)
+ {
+ fprintf(read_summary_fp, " \"%d\": %d%s\n", baseq, output_data.seq_quality_info.read_average_base_quality_distribution[baseq], (baseq < 60) ? "," : "");
+ }
+ fprintf(read_summary_fp, " }\n");
+
+ fprintf(read_summary_fp, "}\n");
+ fclose(read_summary_fp);
}
}
}
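The NXX loop carried over above is the standard cumulative-length calculation: sort the read lengths in descending order, then walk the list until XX% of the total bases is covered; the read length at that point is the NXX value. A Python rendition of the same logic, for reference:

```python
# Compute NXX read lengths (N50, N95, ...) exactly as the C++ loop does.
def nxx_lengths(read_lengths):
    lengths = sorted(read_lengths, reverse=True)
    total = sum(lengths)
    nxx = {}
    for percent in range(1, 101):
        threshold = total * percent / 100.0
        cumulative, index = 0, -1
        while cumulative < threshold:
            index += 1
            cumulative += lengths[index]
        nxx[percent] = lengths[index]
    return nxx

scores = nxx_lengths([5000, 4000, 3000, 2000, 1000])
print(scores[50])  # N50 = 4000: 5000 + 4000 = 9000 >= 7500 (50% of 15000)
```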
diff --git a/src/fasta_module.cpp b/src/fasta_module.cpp
index bcbb946..39d3dd4 100644
--- a/src/fasta_module.cpp
+++ b/src/fasta_module.cpp
@@ -205,9 +205,10 @@ int qc_fasta_files(Input_Para &_input_data, Output_FA &py_output_fa)
const char *input_file = NULL;
std::string read_details_file, read_summary_file;
FILE *read_details_fp, *read_summary_fp;
+ const std::string &sample_name = _input_data.sample_name;
- read_details_file = _input_data.output_folder + "/FASTA_details.txt";
- read_summary_file = _input_data.output_folder + "/FASTA_summary.txt";
+ read_details_file = _input_data.output_folder + "/" + sample_name + "_readqc.fasta.txt";
+ read_summary_file = _input_data.output_folder + "/" + sample_name + "_summary.fasta.json";
// =============
@@ -308,27 +310,36 @@ int qc_fasta_files(Input_Para &_input_data, Output_FA &py_output_fa)
py_output_fa.long_read_info.mean_read_length = (double)py_output_fa.long_read_info.total_num_bases / (double)py_output_fa.long_read_info.total_num_reads;
read_summary_fp = fopen(read_summary_file.c_str(), "w");
- fprintf(read_summary_fp, "total number of reads\t%d\n", py_output_fa.long_read_info.total_num_reads);
- fprintf(read_summary_fp, "total number of bases\t%ld\n", py_output_fa.long_read_info.total_num_bases);
- fprintf(read_summary_fp, "longest read length\t%d\n", py_output_fa.long_read_info.longest_read_length);
- fprintf(read_summary_fp, "N50 read length\t%d\n", py_output_fa.long_read_info.n50_read_length);
- fprintf(read_summary_fp, "mean read length\t%.2f\n", py_output_fa.long_read_info.mean_read_length);
- fprintf(read_summary_fp, "median read length\t%d\n", py_output_fa.long_read_info.median_read_length);
- fprintf(read_summary_fp, "GC%%\t%.2f\n", py_output_fa.long_read_info.gc_cnt * 100);
- fprintf(read_summary_fp, "\n\n");
- for (int percent = 5; percent < 100; percent += 5)
- {
- fprintf(read_summary_fp, "N%02d read length\t%.d\n", percent, py_output_fa.long_read_info.NXX_read_length[percent]);
- }
+ if (read_summary_fp) {
+ // Write JSON summary
+ fprintf(read_summary_fp, "{\n");
+ fprintf(read_summary_fp, " \"filetype\": \"fasta\",\n");
+ fprintf(read_summary_fp, " \"longreadsum_version\": \"%s\",\n", _input_data.getVersion().c_str());
+ fprintf(read_summary_fp, " \"total_num_reads\": %d,\n", py_output_fa.long_read_info.total_num_reads);
+ fprintf(read_summary_fp, " \"total_num_bases\": %ld,\n", py_output_fa.long_read_info.total_num_bases);
+ fprintf(read_summary_fp, " \"longest_read_length\": %d,\n", py_output_fa.long_read_info.longest_read_length);
+ fprintf(read_summary_fp, " \"n50_read_length\": %d,\n", py_output_fa.long_read_info.n50_read_length);
+ fprintf(read_summary_fp, " \"mean_read_length\": %.2f,\n", py_output_fa.long_read_info.mean_read_length);
+ fprintf(read_summary_fp, " \"median_read_length\": %d,\n", py_output_fa.long_read_info.median_read_length);
+ fprintf(read_summary_fp, " \"gc_percent\": %.2f,\n", py_output_fa.long_read_info.gc_cnt * 100);
+
+ // NXX read lengths
+ fprintf(read_summary_fp, " \"NXX_read_length\": {\n");
+ for (int percent = 5; percent < 100; percent += 5) {
+ fprintf(read_summary_fp, " \"N%02d\": %d%s\n", percent, py_output_fa.long_read_info.NXX_read_length[percent], (percent + 5 < 100) ? "," : "");
+ }
+ fprintf(read_summary_fp, " },\n");
- fprintf(read_summary_fp, "\n\n");
+ // GC content distribution
+ fprintf(read_summary_fp, " \"gc_content_distribution\": {\n");
+ for (int gc_ratio = 0; gc_ratio <= 100; gc_ratio++) {
+ fprintf(read_summary_fp, " \"%d\": %d%s\n", gc_ratio, py_output_fa.long_read_info.read_gc_content_count[gc_ratio], (gc_ratio < 100) ? "," : "");
+ }
+ fprintf(read_summary_fp, " }\n");
- fprintf(read_summary_fp, "GC content\tnumber of reads\n");
- for (int gc_ratio = 0; gc_ratio <= 100; gc_ratio++)
- {
- fprintf(read_summary_fp, "GC=%d%%\t%d\n", gc_ratio, py_output_fa.long_read_info.read_gc_content_count[gc_ratio]);
+ fprintf(read_summary_fp, "}\n");
+ fclose(read_summary_fp);
}
- fclose(read_summary_fp);
}
}
}
diff --git a/src/fastq_module.cpp b/src/fastq_module.cpp
index 3d87a32..0d22012 100644
--- a/src/fastq_module.cpp
+++ b/src/fastq_module.cpp
@@ -162,9 +162,9 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data)
std::string read_details_file, read_summary_file;
FILE *read_details_fp, *read_summary_fp;
- read_details_file = _input_data.output_folder + "/FASTQ_details.txt";
- read_summary_file = _input_data.output_folder + "/FASTQ_summary.txt";
-
+ read_details_file = _input_data.output_folder + "/" + _input_data.sample_name + "_readqc.fastq.txt";
+ read_summary_file = _input_data.output_folder + "/" + _input_data.sample_name + "_summary.fastq.json";
+
output_data.long_read_info.total_num_reads = ZeroDefault; // total number of long reads
output_data.long_read_info.total_num_bases = ZeroDefault; // total number of bases
@@ -281,40 +281,51 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data)
output_data.long_read_info.n05_read_length = output_data.long_read_info.NXX_read_length[5];
read_summary_fp = fopen(read_summary_file.c_str(), "w");
- fprintf(read_summary_fp, "total number of reads\t%d\n", output_data.long_read_info.total_num_reads);
- fprintf(read_summary_fp, "total number of bases\t%ld\n", output_data.long_read_info.total_num_bases);
- fprintf(read_summary_fp, "longest read length\t%d\n", output_data.long_read_info.longest_read_length);
- fprintf(read_summary_fp, "N50 read length\t%d\n", output_data.long_read_info.n50_read_length);
- fprintf(read_summary_fp, "mean read length\t%.2f\n", output_data.long_read_info.mean_read_length);
- fprintf(read_summary_fp, "median read length\t%d\n", output_data.long_read_info.median_read_length);
- fprintf(read_summary_fp, "GC%%\t%.2f\n", output_data.long_read_info.gc_cnt * 100);
- fprintf(read_summary_fp, "\n\n");
+ // Write summary in JSON format
+ fprintf(read_summary_fp, "{\n");
+ fprintf(read_summary_fp, " \"filetype\": \"fastq\",\n");
+ fprintf(read_summary_fp, " \"longreadsum_version\": \"%s\",\n", _input_data.getVersion().c_str());
+ fprintf(read_summary_fp, " \"total_num_reads\": %d,\n", output_data.long_read_info.total_num_reads);
+ fprintf(read_summary_fp, " \"total_num_bases\": %ld,\n", output_data.long_read_info.total_num_bases);
+ fprintf(read_summary_fp, " \"longest_read_length\": %d,\n", output_data.long_read_info.longest_read_length);
+ fprintf(read_summary_fp, " \"n50_read_length\": %d,\n", output_data.long_read_info.n50_read_length);
+ fprintf(read_summary_fp, " \"mean_read_length\": %.2f,\n", output_data.long_read_info.mean_read_length);
+ fprintf(read_summary_fp, " \"median_read_length\": %d,\n", output_data.long_read_info.median_read_length);
+ fprintf(read_summary_fp, " \"gc_percent\": %.2f,\n", output_data.long_read_info.gc_cnt * 100);
+
+ // NXX read lengths
+ fprintf(read_summary_fp, " \"NXX_read_length\": {\n");
for (int percent = 5; percent < 100; percent += 5)
{
- fprintf(read_summary_fp, "N%02d read length\t%.d\n", percent, output_data.long_read_info.NXX_read_length[percent]);
+ fprintf(read_summary_fp, " \"N%02d\": %d%s\n", percent, output_data.long_read_info.NXX_read_length[percent], (percent + 5 < 100) ? "," : "");
}
+ fprintf(read_summary_fp, " },\n");
- fprintf(read_summary_fp, "\n\n");
-
- fprintf(read_summary_fp, "GC content\tnumber of reads\n");
+ // GC content distribution
+ fprintf(read_summary_fp, " \"gc_content_distribution\": {\n");
for (int gc_ratio = 0; gc_ratio < 100; gc_ratio++)
{
- fprintf(read_summary_fp, "GC=%d%%\t%d\n", gc_ratio, output_data.long_read_info.read_gc_content_count[gc_ratio]);
+ fprintf(read_summary_fp, " \"%d\": %d%s\n", gc_ratio, output_data.long_read_info.read_gc_content_count[gc_ratio], (gc_ratio < 99) ? "," : "");
}
+ fprintf(read_summary_fp, " },\n");
- fprintf(read_summary_fp, "\n\n");
- fprintf(read_summary_fp, "base quality\tnumber of bases\n");
+ // Base quality distribution
+ fprintf(read_summary_fp, " \"base_quality_distribution\": {\n");
for (int baseq = 0; baseq <= 60; baseq++)
{
- fprintf(read_summary_fp, "%d\t%ld\n", baseq, output_data.seq_quality_info.base_quality_distribution[baseq]);
+ fprintf(read_summary_fp, " \"%d\": %ld%s\n", baseq, output_data.seq_quality_info.base_quality_distribution[baseq], (baseq < 60) ? "," : "");
}
+ fprintf(read_summary_fp, " },\n");
- fprintf(read_summary_fp, "\n\n");
- fprintf(read_summary_fp, "read average base quality\tnumber of reads\n");
+ // Read average base quality distribution
+ fprintf(read_summary_fp, " \"read_average_base_quality_distribution\": {\n");
for (int baseq = 0; baseq <= 60; baseq++)
{
- fprintf(read_summary_fp, "%d\t%d\n", baseq, output_data.seq_quality_info.read_average_base_quality_distribution[baseq]);
+ fprintf(read_summary_fp, " \"%d\": %d%s\n", baseq, output_data.seq_quality_info.read_average_base_quality_distribution[baseq], (baseq < 60) ? "," : "");
}
+ fprintf(read_summary_fp, " }\n");
+
+ fprintf(read_summary_fp, "}\n");
fclose(read_summary_fp);
}
}
diff --git a/src/generate_html.py b/src/generate_html.py
index dce7237..06a0f6a 100644
--- a/src/generate_html.py
+++ b/src/generate_html.py
@@ -20,9 +20,10 @@ def __init__(self, para_list, plot_filepaths, static=True):
else:
self.more_input_files = False
- def generate_header(self):
+ def generate_header(self, sample_name, filetype):
"""Format the header of the HTML file with the title and CSS."""
- html_filepath = self.input_para["output_folder"] + '/' + self.input_para["out_prefix"] + ".html"
+ html_filepath = self.input_para["output_folder"] + '/longreadsum_' + sample_name + '_' + filetype + '.html'
+ logging.info("Generating HTML file: %s", html_filepath)
self.html_writer = open(html_filepath, 'w', encoding='utf-8')
self.html_writer.write("<!DOCTYPE html>")
self.html_writer.write("<html>")
@@ -429,9 +430,9 @@ def generate_end(self):
self.html_writer.close()
# Main function for generating the HTML.
- def generate_html(self, signal_plots=False):
+ def generate_html(self, sample_name, filetype, signal_plots=False):
if signal_plots:
- self.generate_header()
+ self.generate_header(sample_name, filetype)
# Get the signal plots
signal_plots = self.plot_filepaths["ont_signal"]['dynamic']
read_names = signal_plots.keys()
@@ -440,7 +441,7 @@ def generate_html(self, signal_plots=False):
self.generate_end()
else:
# Format base QC
- self.generate_header()
+ self.generate_header(sample_name, filetype)
self.generate_left()
self.generate_right()
self.generate_end()
diff --git a/src/input_parameters.cpp b/src/input_parameters.cpp
index c409526..3f9b976 100644
--- a/src/input_parameters.cpp
+++ b/src/input_parameters.cpp
@@ -1,6 +1,7 @@
#include
#include "input_parameters.h"
+#include "version.h"
Input_Para::Input_Para(){
// Set default parameters
@@ -13,6 +14,8 @@ Input_Para::Input_Para(){
this->base_mod_threshold = 0.5;
this->gene_bed = "";
this->mod_analysis = false;
+ this->sample_name = "Sample";
+ this->version_str = VERSION;
}
Input_Para::~Input_Para(){
@@ -28,3 +31,7 @@ std::string Input_Para::add_input_file(const std::string& input_filepath){
return "Only "+std::to_string(MAX_INPUT_FILES)+" input files are supported!!";
}
}
+
+const std::string& Input_Para::getVersion() const {
+ return this->version_str;
+}
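Because `Input_Para` is already exposed to Python through SWIG (see `tests/test_general.py`), the new accessor should also be reachable from the bindings. A sketch, assuming a completed build with the `lrst` module on `sys.path` and that SWIG wraps `getVersion`:

```python
import lrst  # SWIG-generated module; assumes the project has been built

params = lrst.Input_Para()
params.sample_name = "HG002"
# Returns the string compiled in from include/version.h
print(params.getVersion())  # e.g. "v1.4.0-5-g9e0d2c4"
```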
diff --git a/src/output_data.cpp b/src/output_data.cpp
index ec67b12..ea9116c 100644
--- a/src/output_data.cpp
+++ b/src/output_data.cpp
@@ -4,6 +4,7 @@
#include
#include
#include <cmath> // std::round
+#include <iterator> // std::next
#include "output_data.h"
#include "utils.h"
@@ -550,46 +551,160 @@ void Output_BAM::save_summary(std::string &output_file, Input_Para &params, Outp
if (fp == NULL){
fprintf(stderr, "Error: cannot open file %s\n", output_file.c_str());
} else {
- // Save basic statistics
- fprintf(fp, "Total number of reads\t%d\n", output_data.long_read_info.total_num_reads);
- fprintf(fp, "Total number of bases\t%ld\n", output_data.long_read_info.total_num_bases);
- fprintf(fp, "Longest read length\t%d\n", output_data.long_read_info.longest_read_length);
- fprintf(fp, "N50 read length\t%d\n", output_data.long_read_info.n50_read_length);
- fprintf(fp, "Mean read length\t%.2f\n", output_data.long_read_info.mean_read_length);
- fprintf(fp, "Median read length\t%d\n", output_data.long_read_info.median_read_length);
- fprintf(fp, "GC%%\t%.2f\n", output_data.long_read_info.gc_cnt * 100);
- fprintf(fp, "\n");
-
- // Save the mapping statistics
- fprintf(fp, "Total number of mapped reads\t%d\n", output_data.mapped_long_read_info.total_num_reads);
- fprintf(fp, "Total number of mapped bases\t%ld\n", output_data.mapped_long_read_info.total_num_bases);
- fprintf(fp, "Longest mapped read length\t%d\n", output_data.mapped_long_read_info.longest_read_length);
- fprintf(fp, "N50 mapped read length\t%d\n", output_data.mapped_long_read_info.n50_read_length);
- fprintf(fp, "Mean mapped read length\t%.2f\n", output_data.mapped_long_read_info.mean_read_length);
- fprintf(fp, "Median mapped read length\t%d\n", output_data.mapped_long_read_info.median_read_length);
- fprintf(fp, "GC%%\t%.2f\n", output_data.mapped_long_read_info.gc_cnt * 100);
- fprintf(fp, "\n");
-
- // Save the read alignment statistics
- fprintf(fp, "Total number of primary alignments\t%ld\n", output_data.num_primary_alignment);
- fprintf(fp, "Total number of secondary alignments\t%ld\n", output_data.num_secondary_alignment);
- fprintf(fp, "Total number of supplementary alignments\t%ld\n", output_data.num_supplementary_alignment);
- fprintf(fp, "Total number of reads with secondary alignments\t%ld\n", output_data.num_reads_with_secondary_alignment);
- fprintf(fp, "Total number of reads with supplementary alignments\t%ld\n", output_data.num_reads_with_supplementary_alignment);
- fprintf(fp, "Total number of reads with both secondary and supplementary alignments\t%ld\n", output_data.num_reads_with_both_secondary_supplementary_alignment);
- fprintf(fp, "Total number of reads with forward alignments\t%ld\n", output_data.forward_alignment);
- fprintf(fp, "Total number of reads with reverse alignments\t%ld\n", output_data.reverse_alignment);
- fprintf(fp, "Total number of reverse alignment\t%ld\n", output_data.reverse_alignment);
- fprintf(fp, "\n");
-
- // Save the base alignment statistics
- fprintf(fp, "Total number of matched bases\t%ld\n", output_data.num_matched_bases);
- fprintf(fp, "Total number of mismatched bases\t%ld\n", output_data.num_mismatched_bases);
- fprintf(fp, "Total number of insertions\t%ld\n", output_data.num_ins_bases);
- fprintf(fp, "Total number of deletions\t%ld\n", output_data.num_del_bases);
- fprintf(fp, "Total number of primary alignment clipped bases (soft + hard)\t%ld\n", output_data.num_clip_bases);
-
- // Close the file
+ // Write JSON output
+ fprintf(fp, "{\n");
+ fprintf(fp, " \"filetype\": \"bam\",\n");
+ fprintf(fp, " \"longreadsum_version\": \"%s\",\n", params.getVersion().c_str());
+ fprintf(fp, " \"total_num_reads\": %d,\n", output_data.long_read_info.total_num_reads);
+ fprintf(fp, " \"total_num_bases\": %ld,\n", output_data.long_read_info.total_num_bases);
+ fprintf(fp, " \"longest_read_length\": %d,\n", output_data.long_read_info.longest_read_length);
+ fprintf(fp, " \"n50_read_length\": %d,\n", output_data.long_read_info.n50_read_length);
+ fprintf(fp, " \"mean_read_length\": %.2f,\n", output_data.long_read_info.mean_read_length);
+ fprintf(fp, " \"median_read_length\": %d,\n", output_data.long_read_info.median_read_length);
+ fprintf(fp, " \"gc_percent\": %.2f,\n", output_data.long_read_info.gc_cnt * 100);
+
+ // Mapping statistics
+ fprintf(fp, " \"mapped\": {\n");
+ fprintf(fp, " \"total_num_reads\": %d,\n", output_data.mapped_long_read_info.total_num_reads);
+ fprintf(fp, " \"total_num_bases\": %ld,\n", output_data.mapped_long_read_info.total_num_bases);
+ fprintf(fp, " \"longest_read_length\": %d,\n", output_data.mapped_long_read_info.longest_read_length);
+ fprintf(fp, " \"n50_read_length\": %d,\n", output_data.mapped_long_read_info.n50_read_length);
+ fprintf(fp, " \"mean_read_length\": %.2f,\n", output_data.mapped_long_read_info.mean_read_length);
+ fprintf(fp, " \"median_read_length\": %d,\n", output_data.mapped_long_read_info.median_read_length);
+ fprintf(fp, " \"gc_percent\": %.2f\n", output_data.mapped_long_read_info.gc_cnt * 100);
+ fprintf(fp, " },\n");
+
+ // Read alignment statistics
+ fprintf(fp, " \"alignments\": {\n");
+ fprintf(fp, " \"primary\": %ld,\n", output_data.num_primary_alignment);
+ fprintf(fp, " \"secondary\": %ld,\n", output_data.num_secondary_alignment);
+ fprintf(fp, " \"supplementary\": %ld,\n", output_data.num_supplementary_alignment);
+ fprintf(fp, " \"reads_with_secondary\": %ld,\n", output_data.num_reads_with_secondary_alignment);
+ fprintf(fp, " \"reads_with_supplementary\": %ld,\n", output_data.num_reads_with_supplementary_alignment);
+ fprintf(fp, " \"reads_with_both\": %ld,\n", output_data.num_reads_with_both_secondary_supplementary_alignment);
+ fprintf(fp, " \"forward\": %ld,\n", output_data.forward_alignment);
+ fprintf(fp, " \"reverse\": %ld\n", output_data.reverse_alignment);
+ fprintf(fp, " },\n");
+
+ // Base alignment statistics
+ fprintf(fp, " \"base_alignment\": {\n");
+ fprintf(fp, " \"matched\": %ld,\n", output_data.num_matched_bases);
+ fprintf(fp, " \"mismatched\": %ld,\n", output_data.num_mismatched_bases);
+ fprintf(fp, " \"insertions\": %ld,\n", output_data.num_ins_bases);
+ fprintf(fp, " \"deletions\": %ld,\n", output_data.num_del_bases);
+ fprintf(fp, " \"clipped\": %ld\n", output_data.num_clip_bases);
+
+ // Determine if there is base modification data
+ bool modification_data_exists = output_data.modified_prediction_count > 0;
+ bool tin_data_exists = !output_data.tin_data.empty();
+ if (modification_data_exists || tin_data_exists) {
+ fprintf(fp, " },\n");
+ } else {
+ fprintf(fp, " }\n");
+ }
+
+ // TIN data statistics (if any)
+ if (tin_data_exists) {
+ fprintf(fp, " \"tin_data\": {\n");
+ for (auto it = output_data.tin_data.begin(); it != output_data.tin_data.end(); ++it) {
+ // Print each BAM file's TIN statistics
+ const std::string& bam_file = it->first;
+ const TINStats& tin_data = it->second;
+ fprintf(fp, " \"%s\": {\n", bam_file.c_str());
+ fprintf(fp, " \"total_transcripts\": %d,\n", tin_data.num_transcripts);
+ fprintf(fp, " \"mean\": %.2f,\n", tin_data.mean);
+ fprintf(fp, " \"median\": %.2f,\n", tin_data.median);
+ fprintf(fp, " \"stddev\": %.2f\n", tin_data.stddev);
+ fprintf(fp, " }");
+ if (std::next(it) != output_data.tin_data.end()) {
+ fprintf(fp, ",\n");
+ } else {
+ fprintf(fp, "\n");
+ }
+ }
+ if (modification_data_exists) {
+ fprintf(fp, " },\n");
+ } else {
+ fprintf(fp, " }\n");
+ }
+ }
+
+ // Base modification statistics (if any)
+ if (modification_data_exists) {
+
+ // Map of modification character to full name
+ std::unordered_map<char, std::string> mod_char_to_name = {
+ {'m', "5mC"}, {'h', "5hmC"}, {'f', "5fC"}, {'c', "5caC"},
+ {'g', "5hmU"}, {'e', "5fu"}, {'b', "5caU"},
+ {'a', "6mA"}, {'o', "8oxoG"}, {'n', "Xao"},
+ {'C', "Amb. C"}, {'A', "Amb. A"}, {'T', "Amb. T"}, {'G', "Amb. G"},
+ {'N', "Amb. N"},
+ {'v', "pseU"}
+ };
+
+ fprintf(fp, " \"base_modifications\": {\n");
+ fprintf(fp, " \"unfiltered_modifications\": %lu,\n", output_data.modified_prediction_count);
+ fprintf(fp, " \"filter_threshold\": %.2f,\n", params.base_mod_threshold);
+ fprintf(fp, " \"sample_modified_base_count\": %lu,\n", output_data.sample_modified_base_count);
+ fprintf(fp, " \"sample_modified_base_count_forward\": %lu,\n", output_data.sample_modified_base_count_forward);
+ fprintf(fp, " \"sample_modified_base_count_reverse\": %lu,\n", output_data.sample_modified_base_count_reverse);
+ fprintf(fp, " \"cpg_forward\": %lu,\n", output_data.sample_cpg_forward_count);
+ fprintf(fp, " \"cpg_reverse\": %lu,\n", output_data.sample_cpg_reverse_count);
+ fprintf(fp, " \"base_mod_counts\": {\n");
+ for (auto it = output_data.base_mod_counts.begin(); it != output_data.base_mod_counts.end(); ++it) {
+ char mod_type = it->first;
+ std::string mod_name = std::string(1, mod_type);
+ auto it_char = mod_char_to_name.find(mod_type);
+ if (it_char != mod_char_to_name.end()) {
+ mod_name = it_char->second;
+ }
+ uint64_t count = it->second;
+ fprintf(fp, " \"%s\": %lu", mod_name.c_str(), count);
+ if (std::next(it) != output_data.base_mod_counts.end()) {
+ fprintf(fp, ",\n");
+ } else {
+ fprintf(fp, "\n");
+ }
+ }
+ fprintf(fp, " },\n");
+ fprintf(fp, " \"base_mod_counts_forward\": {\n");
+ for (auto it = output_data.base_mod_counts_forward.begin(); it != output_data.base_mod_counts_forward.end(); ++it) {
+ char mod_type = it->first;
+ std::string mod_name = std::string(1, mod_type);
+ auto it_char = mod_char_to_name.find(mod_type);
+ if (it_char != mod_char_to_name.end()) {
+ mod_name = it_char->second;
+ }
+ uint64_t count = it->second;
+ fprintf(fp, " \"%s\": %lu", mod_name.c_str(), count);
+ if (std::next(it) != output_data.base_mod_counts_forward.end()) {
+ fprintf(fp, ",\n");
+ } else {
+ fprintf(fp, "\n");
+ }
+ }
+ fprintf(fp, " },\n");
+ fprintf(fp, " \"base_mod_counts_reverse\": {\n");
+ for (auto it = output_data.base_mod_counts_reverse.begin(); it != output_data.base_mod_counts_reverse.end(); ++it) {
+ char mod_type = it->first;
+ std::string mod_name = std::string(1, mod_type);
+ auto it_char = mod_char_to_name.find(mod_type);
+ if (it_char != mod_char_to_name.end()) {
+ mod_name = it_char->second;
+ }
+ uint64_t count = it->second;
+ fprintf(fp, " \"%s\": %lu", mod_name.c_str(), count);
+ if (std::next(it) != output_data.base_mod_counts_reverse.end()) {
+ fprintf(fp, ",\n");
+ } else {
+ fprintf(fp, "\n");
+ }
+ }
+ fprintf(fp, " }\n");
+ fprintf(fp, " }\n");
+ }
+
+ fprintf(fp, "}\n");
fclose(fp);
}
}
@@ -639,32 +755,32 @@ void Output_SeqTxt::global_sum(){
void Output_SeqTxt::save_summary(std::string & output_file, Input_Para & params)
{
-
FILE *fp = fopen(output_file.c_str(), "w");
if (fp == NULL){
fprintf(stderr, "Error: cannot open file %s\n", output_file.c_str());
} else {
- // Define the types explicitly
- using ReadInfo = std::tuple<std::string, Basic_Seq_Statistics&>;
-
- // Save basic statistics for total, passed, and failed reads
- for (const ReadInfo& read_type : {
- ReadInfo("All", all_long_read_info.long_read_info),
- ReadInfo("Passed", passed_long_read_info.long_read_info),
- ReadInfo("Failed", failed_long_read_info.long_read_info)
- }) {
- std::string read_filter = std::get<0>(read_type);
- Basic_Seq_Statistics& long_read_info = std::get<1>(read_type);
-
- fprintf(fp, "%s reads:\n", read_filter.c_str());
- fprintf(fp, "Total number of reads\t%d\n", long_read_info.total_num_reads);
- fprintf(fp, "Total number of bases\t%ld\n", long_read_info.total_num_bases);
- fprintf(fp, "Longest read length\t%d\n", long_read_info.longest_read_length);
- fprintf(fp, "N50 read length\t%d\n", long_read_info.n50_read_length);
- fprintf(fp, "Mean read length\t%.2f\n", long_read_info.mean_read_length);
- fprintf(fp, "Median read length\t%d\n", long_read_info.median_read_length);
- fprintf(fp, "\n");
- }
+ // Write JSON output for all, passed, and failed reads
+ fprintf(fp, "{\n");
+ fprintf(fp, " \"filetype\": \"sequencing_summary\",\n");
+ fprintf(fp, " \"longreadsum_version\": \"%s\",\n", params.getVersion().c_str());
+
+ // Helper lambda to write a block for each read type
+ auto write_read_info = [fp](const char* label, const Basic_Seq_Statistics& info, bool last) {
+ fprintf(fp, " \"%s\": {\n", label);
+ fprintf(fp, " \"total_num_reads\": %d,\n", info.total_num_reads);
+ fprintf(fp, " \"total_num_bases\": %ld,\n", info.total_num_bases);
+ fprintf(fp, " \"longest_read_length\": %d,\n", info.longest_read_length);
+ fprintf(fp, " \"n50_read_length\": %d,\n", info.n50_read_length);
+ fprintf(fp, " \"mean_read_length\": %.2f,\n", info.mean_read_length);
+ fprintf(fp, " \"median_read_length\": %d\n", info.median_read_length);
+ fprintf(fp, " }%s\n", last ? "" : ",");
+ };
+
+ write_read_info("all", all_long_read_info.long_read_info, false);
+ write_read_info("passed", passed_long_read_info.long_read_info, false);
+ write_read_info("failed", failed_long_read_info.long_read_info, true);
+
+ fprintf(fp, "}\n");
fclose(fp);
}
}
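Note what the hand-rolled emitter above has to manage: each `fprintf` must choose between `"},"` and `"}"` depending on whether the optional `tin_data` and `base_modifications` sections follow. For contrast, a sketch of the same shape built as a dictionary and serialized in one call, where optional sections need no comma bookkeeping (values are placeholders, not the project's code):

```python
# Build the summary as a dict and let the serializer handle separators.
import json

summary = {
    "filetype": "bam",
    "longreadsum_version": "v1.4.0-5-g9e0d2c4",
    "total_num_reads": 0,
    "base_alignment": {"matched": 0, "mismatched": 0, "clipped": 0},
}
tin_data = {}  # populated only when TIN was computed
if tin_data:
    summary["tin_data"] = tin_data  # optional section, no trailing-comma logic
print(json.dumps(summary, indent=2))
```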
diff --git a/src/seqtxt_module.cpp b/src/seqtxt_module.cpp
index 991b87f..dadfc88 100644
--- a/src/seqtxt_module.cpp
+++ b/src/seqtxt_module.cpp
@@ -217,7 +217,7 @@ int SeqTxt_Module::generateStatistics( Output_SeqTxt& t_output_SeqTxt_info){
// Save summary statistics to the output file
std::cout << "Saving summary statistics to file..." << std::endl;
- std::string summary_filepath = _input_parameters.output_folder + "/seqtxt_summary.txt";
+ std::string summary_filepath = _input_parameters.output_folder + "/" + _input_parameters.sample_name + "_summary.json";
t_output_SeqTxt_info.save_summary(summary_filepath, _input_parameters);
auto relapse_end_time = std::chrono::high_resolution_clock::now();
diff --git a/src/tin.cpp b/src/tin.cpp
index da945a1..2919f50 100644
--- a/src/tin.cpp
+++ b/src/tin.cpp
@@ -178,7 +178,7 @@ bool checkMinReads(htsFile* bam_file, hts_idx_t* idx, bam_hdr_t* header, std::st
return min_reads_met;
}
-void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder, int thread_count)
+void calculateTIN(TINStats& tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& sample_name, const std::string& output_folder, int thread_count)
{
std::cout << "Using TIN minimum coverage " << min_cov << " and sample size " << sample_size << std::endl;
@@ -474,7 +474,7 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s
std::cout << "Writing TIN scores to file..." << std::endl;
// Write the TIN scores to a file
- std::string output_tin_tsv = output_folder + "/tin_scores.tsv";
+ std::string output_tin_tsv = output_folder + "/" + sample_name + "_tin_scores.tsv";
std::ofstream output_tin_file(output_tin_tsv);
output_tin_file << std::fixed << std::setprecision(14);
@@ -502,7 +502,7 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s
std::cout << "TIN scores written to " << output_tin_tsv << std::endl;
// Write the TIN summary to a file
- std::string output_tin_summary_tsv = output_folder + "/tin_summary.tsv";
+ std::string output_tin_summary_tsv = output_folder + "/" + sample_name + "_tin_summary.tsv";
std::ofstream output_tin_summary_file(output_tin_summary_tsv);
output_tin_summary_file << std::fixed << std::setprecision(14);
@@ -524,9 +524,9 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s
std::cout << "TIN summary written to " << output_tin_summary_tsv << std::endl;
// Update the TIN stats struct
- tin_stats->mean = TIN_mean;
- tin_stats->median = TIN_median;
- tin_stats->stddev = TIN_stddev;
- tin_stats->num_transcripts = TIN_scores.size();
+ tin_stats.mean = TIN_mean;
+ tin_stats.median = TIN_median;
+ tin_stats.stddev = TIN_stddev;
+ tin_stats.num_transcripts = TIN_scores.size();
}
}
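For context on what `calculateTIN` summarizes: the transcript integrity number, as defined by RSeQC, is the exponential of the Shannon entropy of coverage over k equally spaced positions, scaled to 0-100 (uniform coverage scores 100). Whether LongReadSum follows this exact formula is an assumption here; the sketch below only illustrates the published definition:

```python
# RSeQC-style TIN: 100 * exp(H) / k, with H the Shannon entropy of the
# normalized read depths at k sampled positions along a transcript.
import math

def tin_score(depths):
    total = float(sum(depths))
    if total == 0:
        return 0.0
    entropy = -sum((d / total) * math.log(d / total) for d in depths if d > 0)
    return 100.0 * math.exp(entropy) / len(depths)

print(tin_score([10] * 100))        # uniform coverage      -> 100.0
print(tin_score([100] + [0] * 99))  # single-spike coverage -> 1.0
```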
diff --git a/tests/test_general.py b/tests/test_general.py
index c0a8420..695081b 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -21,9 +21,12 @@ def fasta_output():
"""
# Set parameters
default_parameters = lrst.Input_Para()
- output_folder = os.path.abspath(str("output/"))
+ output_folder = os.path.abspath(str("output/fasta/"))
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+
default_parameters.output_folder = output_folder
- default_parameters.out_prefix = str("fa_")
+ default_parameters.sample_name = "HG002"
# Check if running remotely
local_dir = os.path.expanduser('~/github/LongReadSum')
@@ -80,9 +83,12 @@ def multiple_fasta_output():
"""
# Set parameters
default_parameters = lrst.Input_Para()
- output_folder = os.path.abspath(str("output/"))
+ output_folder = os.path.abspath(str("output/multi_fasta/"))
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+
default_parameters.output_folder = output_folder
- default_parameters.out_prefix = str("fa_multi_")
+ default_parameters.sample_name = "HG002"
# Check if running remotely
local_dir = os.path.expanduser('~/github/LongReadSum')
@@ -145,9 +151,12 @@ def fastq_output():
"""
# Set parameters
default_parameters = lrst.Input_Para()
- output_folder = os.path.abspath(str("output/"))
+ output_folder = os.path.abspath(str("output/fastq/"))
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+
default_parameters.output_folder = output_folder
- default_parameters.out_prefix = str("fq_")
+ default_parameters.sample_name = "HG002"
# Check if running remotely
local_dir = os.path.expanduser('~/github/LongReadSum')
@@ -205,9 +214,12 @@ def fast5_output():
"""
# Set parameters
default_parameters = lrst.Input_Para()
- output_folder = os.path.abspath(str("output/"))
+ output_folder = os.path.abspath(str("output/fast5/"))
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+
default_parameters.output_folder = output_folder
- default_parameters.out_prefix = str("f5_")
+ default_parameters.sample_name = "HG002"
# Check if running remotely
file_dir = ''
@@ -280,9 +292,12 @@ def fast5s_output():
"""
# Set parameters
default_parameters = lrst.Input_Para()
- output_folder = os.path.abspath(str("output/"))
+ output_folder = os.path.abspath(str("output/fast5s/"))
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+
default_parameters.output_folder = output_folder
- default_parameters.out_prefix = str("f5s_")
+ default_parameters.sample_name = "HG002"
default_parameters.other_flags = 1 # 0 for normal QC, 1 for signal statistics output
# Check if running remotely
@@ -351,9 +366,12 @@ def bam_output():
"""
# Set parameters
default_parameters = lrst.Input_Para()
- output_folder = os.path.abspath(str("output/"))
+ output_folder = os.path.abspath(str("output/bam"))
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+
default_parameters.output_folder = output_folder
- default_parameters.out_prefix = str("bam_")
+ default_parameters.sample_name = "HG002"
# Check if running remotely
local_dir = os.path.expanduser('~/github/LongReadSum')
@@ -420,9 +438,12 @@ def unmapped_bam_output():
"""Run the BAM module on unmapped inputs."""
# Set parameters
default_parameters = lrst.Input_Para()
- output_folder = os.path.abspath(str("output/"))
+ output_folder = os.path.abspath(str("output/ubam/"))
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+
default_parameters.output_folder = output_folder
- default_parameters.out_prefix = str("ubam_")
+ default_parameters.sample_name = "HG002"
# Check if running remotely
local_dir = os.path.expanduser('~/github/LongReadSum')
@@ -487,11 +508,15 @@ def forward_base_mod_output():
"""Run the BAM module on a read aligned to the forward strand with base modifications."""
# Set parameters
default_parameters = lrst.Input_Para()
- output_folder = os.path.abspath(str("output/"))
+ output_folder = os.path.abspath(str("output/fmod/"))
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+
default_parameters.output_folder = output_folder
- default_parameters.out_prefix = str("fwdmod_")
+ default_parameters.sample_name = "HG002"
+
default_parameters.mod_analysis = True
- default_parameters.base_mod_threshold = -1.0
+ default_parameters.base_mod_threshold = 0
# Check if running remotely
local_dir = os.path.expanduser('~/github/LongReadSum')
@@ -559,11 +584,15 @@ def reverse_base_mod_output():
"""Run the BAM module on a read aligned to the reverse strand with base modifications."""
# Set parameters
default_parameters = lrst.Input_Para()
- output_folder = os.path.abspath(str("output/"))
+ output_folder = os.path.abspath(str("output/rmod/"))
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+
default_parameters.output_folder = output_folder
- default_parameters.out_prefix = str("revmod_")
+ default_parameters.sample_name = "HG002"
+
default_parameters.mod_analysis = True
- default_parameters.base_mod_threshold = -1.0
+ default_parameters.base_mod_threshold = 0
# Check if running remotely
local_dir = os.path.expanduser('~/github/LongReadSum')
@@ -633,10 +662,13 @@ def seqtxt_output():
"""
# Set parameters
default_parameters = lrst.Input_Para()
- output_folder = os.path.abspath(str("output/"))
- default_parameters.output_folder = output_folder
- default_parameters.out_prefix = str("seqtxt_")
+ output_folder = os.path.abspath(str("output/seqtxt/"))
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+ default_parameters.output_folder = output_folder
+ default_parameters.sample_name = "HG002"
+
# Check if running remotely
local_dir = os.path.expanduser('~/github/LongReadSum')
if os.getcwd() == local_dir:
@@ -708,9 +740,13 @@ def rnaseq_bam_output():
"""Run the BAM module on RNASeq inputs."""
# Set parameters
default_parameters = lrst.Input_Para()
- output_folder = os.path.abspath(str("output/"))
+ output_folder = os.path.abspath(str("output/rnaseq/"))
+ if not os.path.exists(output_folder):
+ os.makedirs(output_folder)
+
default_parameters.output_folder = output_folder
- default_parameters.out_prefix = str("rnaseq_")
+ default_parameters.sample_name = "GTEX"
+
default_parameters.tin_sample_size = 100
default_parameters.tin_min_coverage = 2
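A side note on the exists-check/`os.makedirs` pair repeated throughout these tests: `os.makedirs(..., exist_ok=True)` is the equivalent one-liner, shown here as a sketch rather than as a change to the patch:

```python
import os

output_folder = os.path.abspath("output/bam/")
os.makedirs(output_folder, exist_ok=True)  # no prior os.path.exists() needed
```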