From 268ef040ebcb907624943a7e2f64d4e7c0bac26f Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 17 Jul 2025 15:29:52 -0400 Subject: [PATCH 1/9] add sample name and update file path --- Makefile | 8 +++++ include/input_parameters.h | 5 +++- include/version.h | 2 ++ src/cli.py | 61 ++++++++++++++------------------------ src/fasta_module.cpp | 51 ++++++++++++++++++------------- src/generate_html.py | 11 +++---- src/input_parameters.cpp | 7 +++++ tests/test_general.py | 15 +++++++--- 8 files changed, 91 insertions(+), 69 deletions(-) create mode 100644 include/version.h diff --git a/Makefile b/Makefile index bcab4bf..4816617 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,9 @@ INCL_DIR := $(CURDIR)/include SRC_DIR := $(CURDIR)/src LIB_DIR := $(CURDIR)/lib +VERSION := $(shell git describe --tags --always) +VERSION_HEADER := $(INCL_DIR)/version.h + # Set the library paths for the compiler CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX) LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib @@ -10,6 +13,11 @@ INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include # All targets all: swig_build compile +# Rule to generate version.h +$(VERSION_HEADER): + @echo "#pragma once" > $@ + @echo "#define VERSION \"$(VERSION)\"" >> $@ + # Generate the SWIG Python/C++ wrappers swig_build: mkdir -p $(LIB_DIR) diff --git a/include/input_parameters.h b/include/input_parameters.h index e092f5f..cdc9f44 100644 --- a/include/input_parameters.h +++ b/include/input_parameters.h @@ -21,7 +21,6 @@ class Input_Para{ // Parameters int threads; size_t num_input_files; - std::string out_prefix; int64_t other_flags; int32_t user_defined_fastq_base_qual_offset; std::string output_folder; // Output folder @@ -36,10 +35,14 @@ class Input_Para{ bool mod_analysis; // Perform base modification analysis on BAM file int tin_sample_size; // Number of equally spaced samples for TIN calculation int tin_min_coverage; // Minimum coverage for TIN calculation + std::string sample_name; // Sample name + std::string version_str = ""; // Version string for the program // Functions std::string add_input_file(const std::string& input_filepath); + const std::string &getVersion() const; + Input_Para(); ~Input_Para(); diff --git a/include/version.h b/include/version.h new file mode 100644 index 0000000..767e900 --- /dev/null +++ b/include/version.h @@ -0,0 +1,2 @@ +#pragma once +#define VERSION "v1.4.0-5-g9e0d2c4" diff --git a/src/cli.py b/src/cli.py index 3d0f7e2..176a9e2 100644 --- a/src/cli.py +++ b/src/cli.py @@ -65,9 +65,6 @@ def get_common_param(margs): pat_split = margs.pattern.split("*") param_dict["input_files"].extend( glob(os.path.join("*".join(pat_split[:-1]), "*" + pat_split[-1]))) - - # read_count = int(margs.read_count) - # param_dict["read_count"] = read_count if len(param_dict["input_files"]) == 0: parsing_error_msg += "No input file(s) were provided.\n" @@ -88,8 +85,6 @@ def get_common_param(margs): except OSError: parsing_error_msg += "Cannot create folder for " + param_dict["output_folder"] + " \n" - param_dict["out_prefix"] = margs.outprefix - # Set up logging to file and stdout if margs.log is None or margs.log == "": parsing_error_msg += "No log file was provided.\n" @@ -113,6 +108,7 @@ def get_common_param(margs): param_dict["log_level"] = margs.log_level param_dict["threads"] = margs.threads + param_dict["sample_name"] = margs.sample # Reset the param_dict if there are parsing errors if parsing_error_msg != "": @@ -133,7 +129,6 @@ def fq_module(margs): else: logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"])) - param_dict["out_prefix"] += "fastq" # Import the SWIG Python wrapper for our C++ module input_para = lrst.Input_Para() @@ -141,9 +136,8 @@ def fq_module(margs): input_para.other_flags = 0 input_para.user_defined_fastq_base_qual_offset = margs.udqual - input_para.output_folder = str(param_dict["output_folder"]) - input_para.out_prefix = str(param_dict["out_prefix"]) + input_para.sample_name = param_dict["sample_name"] for _ipf in param_dict["input_files"]: input_para.add_input_file(str(_ipf)) @@ -157,7 +151,7 @@ def fq_module(margs): fq_html_gen = generate_html.ST_HTML_Generator( [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality", "read_avg_base_quality"], "FASTQ QC", param_dict], plot_filepaths, static=False) - fq_html_gen.generate_html() + fq_html_gen.generate_html(input_para.sample_name, "fastq") logging.info("Done. Output files are in %s", param_dict["output_folder"]) else: @@ -176,12 +170,11 @@ def fa_module(margs): else: # If there are no parse errors, run the filetype-specific module logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"])) - param_dict["out_prefix"] += "fasta" input_para = lrst.Input_Para() input_para.threads = param_dict["threads"] input_para.other_flags = 0 input_para.output_folder = str(param_dict["output_folder"]) - input_para.out_prefix = str(param_dict["out_prefix"]) + input_para.sample_name = param_dict["sample_name"] for _ipf in param_dict["input_files"]: input_para.add_input_file(str(_ipf)) @@ -195,7 +188,7 @@ def fa_module(margs): fa_html_gen = generate_html.ST_HTML_Generator( [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts"], "FASTA QC", param_dict], plot_filepaths, static=True) - fa_html_gen.generate_html() + fa_html_gen.generate_html(input_para.sample_name, "fasta") logging.info("Done. Output files are in %s", param_dict["output_folder"]) else: @@ -211,11 +204,10 @@ def bam_module(margs): else: logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"])) - param_dict["out_prefix"] += "bam"; input_para = lrst.Input_Para() input_para.threads = param_dict["threads"] input_para.output_folder = str(param_dict["output_folder"]) - input_para.out_prefix = str(param_dict["out_prefix"]) + input_para.sample_name = param_dict["sample_name"] # Set the reference genome file and base modification threshold ref_genome = margs.ref if margs.ref != "" or margs.ref is not None else "" @@ -269,7 +261,7 @@ def bam_module(margs): # Generate the HTML report bam_html_gen = generate_html.ST_HTML_Generator( [qc_info_list, "BAM QC", param_dict], plot_filepaths, static=False) - bam_html_gen.generate_html() + bam_html_gen.generate_html(input_para.sample_name, "bam") logging.info("Done. Output files are in %s", param_dict["output_folder"]) else: @@ -288,7 +280,8 @@ def rrms_module(margs): input_para = lrst.Input_Para() input_para.threads = param_dict["threads"] input_para.output_folder = str(param_dict["output_folder"]) - input_para.out_prefix = str(param_dict["out_prefix"]) + input_para.sample_name = param_dict["sample_name"] + for _ipf in param_dict["input_files"]: input_para.add_input_file(str(_ipf)) @@ -296,18 +289,10 @@ def rrms_module(margs): input_para.rrms_csv = margs.csv logging.info("RRMS CSV file is " + input_para.rrms_csv) - # Get the output prefix - output_prefix = param_dict["out_prefix"] - # Run QC for both accepted and rejected reads rrms_filter = [True, False] for filter_type in rrms_filter: - - # Set the RRMS filter type - input_para.rrms_filter = filter_type - - # Set the output prefix - param_dict["out_prefix"] = output_prefix + "rrms_" + ("accepted" if filter_type else "rejected") + input_para.rrms_filter = filter_type # True for accepted reads, False for rejected reads param_dict["mod"] = input_para.mod_analysis = False # Disable base modification analysis for RRMS (use BAM module for this) # Run the QC module @@ -332,7 +317,7 @@ def rrms_module(margs): # Generate the HTML report bam_html_gen = generate_html.ST_HTML_Generator( [qc_info_list, "BAM QC", param_dict], plot_filepaths, static=False) - bam_html_gen.generate_html() + bam_html_gen.generate_html(input_para.sample_name, "rrms_bam_" + ("accepted" if filter_type else "rejected")) logging.info("Done. Output files are in %s", param_dict["output_folder"]) else: @@ -349,11 +334,10 @@ def seqtxt_module(margs): else: logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"])) - param_dict["out_prefix"] += "seqtxt" input_para = lrst.Input_Para() input_para.threads = param_dict["threads"] input_para.output_folder = str(param_dict["output_folder"]) - input_para.out_prefix = str(param_dict["out_prefix"]) + input_para.sample_name = param_dict["sample_name"] for _ipf in param_dict["input_files"]: input_para.add_input_file(str(_ipf)) @@ -370,7 +354,7 @@ def seqtxt_module(margs): [["basic_st", "read_length_bar", "read_length_hist"], report_title, param_dict], plot_filepaths, static=False) - seqtxt_html_gen.generate_html() + seqtxt_html_gen.generate_html(input_para.sample_name, "basecall_summary") logging.info("Done. Output files are in %s", param_dict["output_folder"]) else: logging.error("QC did not generate.") @@ -386,11 +370,10 @@ def fast5_module(margs): else: # logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"])) - param_dict["out_prefix"] += "FAST5" input_para = lrst.Input_Para() input_para.threads = param_dict["threads"] input_para.output_folder = str(param_dict["output_folder"]) - input_para.out_prefix = str(param_dict["out_prefix"]) + input_para.sample_name = param_dict["sample_name"] input_para.other_flags = 0 # 0 for normal QC, 1 for signal statistics output for _ipf in param_dict["input_files"]: @@ -405,7 +388,7 @@ def fast5_module(margs): fast5_html_obj = generate_html.ST_HTML_Generator( [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality"], "FAST5 QC", param_dict], plot_filepaths, static=False) - fast5_html_obj.generate_html() + fast5_html_obj.generate_html(input_para.sample_name, "fast5") logging.info("Done. Output files are in %s", param_dict["output_folder"]) else: @@ -421,11 +404,11 @@ def fast5_signal_module(margs): else: # logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"])) - param_dict["out_prefix"] += "fast5_signal" input_para = lrst.Input_Para() input_para.threads = param_dict["threads"] input_para.output_folder = str(param_dict["output_folder"]) - input_para.out_prefix = str(param_dict["out_prefix"]) + input_para.sample_name = param_dict["sample_name"] + input_para.other_flags = 1 # 0 for normal QC, 1 for signal statistics output # Get the read count if specified @@ -450,7 +433,7 @@ def fast5_signal_module(margs): plot_filepaths = plot(fast5_output, param_dict, 'FAST5s') fast5_html_obj = generate_html.ST_HTML_Generator( [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False) - fast5_html_obj.generate_html(signal_plots=True) + fast5_html_obj.generate_html(input_para.sample_name, "fast5_signal", signal_plots=True) logging.info("Done. Output files are in %s", param_dict["output_folder"]) else: @@ -467,11 +450,10 @@ def pod5_module(margs): else: # logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"])) - param_dict["out_prefix"] += "POD5" input_para = {} input_para['threads'] = param_dict["threads"] input_para['output_folder'] = str(param_dict["output_folder"]) - input_para['out_prefix'] = str(param_dict["out_prefix"]) + input_para['sample_name'] = param_dict["sample_name"] input_para['other_flags'] = 0 # 0 for normal QC, 1 for signal statistics output input_para['input_files'] = [] for input_file in param_dict["input_files"]: @@ -496,7 +478,6 @@ def pod5_module(margs): basecalls_input = lrst.Input_Para() basecalls_input.threads = param_dict["threads"] basecalls_input.output_folder = str(param_dict["output_folder"]) - basecalls_input.out_prefix = str(param_dict["out_prefix"]) basecalls_input.add_input_file(basecalls) bam_output = lrst.Output_BAM() exit_code = lrst.callBAMModule(basecalls_input, bam_output) @@ -518,7 +499,7 @@ def pod5_module(margs): webpage_title = "POD5 QC" fast5_html_obj = generate_html.ST_HTML_Generator( [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False) - fast5_html_obj.generate_html(signal_plots=True) + fast5_html_obj.generate_html(input_para.sample_name, "pod5_signal", signal_plots=True) logging.info("Done. Output files are in %s", param_dict["output_folder"]) else: @@ -543,6 +524,8 @@ def set_file_parser_defaults(file_parser): help="The number of threads used. Default: 1.") file_parser.add_argument("-Q", "--outprefix", type=str, default="QC_", help="The prefix for output filenames. Default: `QC_`.") + file_parser.add_argument("-s", "--sample", type=str, default="Sample", + help="Sample name. Default: `Sample`") # Set up the argument parser diff --git a/src/fasta_module.cpp b/src/fasta_module.cpp index bcbb946..39d3dd4 100644 --- a/src/fasta_module.cpp +++ b/src/fasta_module.cpp @@ -205,9 +205,11 @@ int qc_fasta_files(Input_Para &_input_data, Output_FA &py_output_fa) const char *input_file = NULL; std::string read_details_file, read_summary_file; FILE *read_details_fp, *read_summary_fp; + const std::string &sample_name = _input_data.sample_name; - read_details_file = _input_data.output_folder + "/FASTA_details.txt"; - read_summary_file = _input_data.output_folder + "/FASTA_summary.txt"; + read_details_file = _input_data.output_folder + "/" + sample_name + "_readqc.fasta.txt"; + // read_summary_file = _input_data.output_folder + "/FASTA_summary.txt"; + read_summary_file = _input_data.output_folder + "/" + sample_name + "_summary.fasta.json"; // ============= @@ -308,27 +310,36 @@ int qc_fasta_files(Input_Para &_input_data, Output_FA &py_output_fa) py_output_fa.long_read_info.mean_read_length = (double)py_output_fa.long_read_info.total_num_bases / (double)py_output_fa.long_read_info.total_num_reads; read_summary_fp = fopen(read_summary_file.c_str(), "w"); - fprintf(read_summary_fp, "total number of reads\t%d\n", py_output_fa.long_read_info.total_num_reads); - fprintf(read_summary_fp, "total number of bases\t%ld\n", py_output_fa.long_read_info.total_num_bases); - fprintf(read_summary_fp, "longest read length\t%d\n", py_output_fa.long_read_info.longest_read_length); - fprintf(read_summary_fp, "N50 read length\t%d\n", py_output_fa.long_read_info.n50_read_length); - fprintf(read_summary_fp, "mean read length\t%.2f\n", py_output_fa.long_read_info.mean_read_length); - fprintf(read_summary_fp, "median read length\t%d\n", py_output_fa.long_read_info.median_read_length); - fprintf(read_summary_fp, "GC%%\t%.2f\n", py_output_fa.long_read_info.gc_cnt * 100); - fprintf(read_summary_fp, "\n\n"); - for (int percent = 5; percent < 100; percent += 5) - { - fprintf(read_summary_fp, "N%02d read length\t%.d\n", percent, py_output_fa.long_read_info.NXX_read_length[percent]); - } + if (read_summary_fp) { + // Write JSON summary + fprintf(read_summary_fp, "{\n"); + fprintf(read_summary_fp, " \"filetype\": \"fasta\",\n"); + fprintf(read_summary_fp, " \"longreadsum_version\": \"%s\",\n", _input_data.getVersion().c_str()); + fprintf(read_summary_fp, " \"total_num_reads\": %d,\n", py_output_fa.long_read_info.total_num_reads); + fprintf(read_summary_fp, " \"total_num_bases\": %ld,\n", py_output_fa.long_read_info.total_num_bases); + fprintf(read_summary_fp, " \"longest_read_length\": %d,\n", py_output_fa.long_read_info.longest_read_length); + fprintf(read_summary_fp, " \"n50_read_length\": %d,\n", py_output_fa.long_read_info.n50_read_length); + fprintf(read_summary_fp, " \"mean_read_length\": %.2f,\n", py_output_fa.long_read_info.mean_read_length); + fprintf(read_summary_fp, " \"median_read_length\": %d,\n", py_output_fa.long_read_info.median_read_length); + fprintf(read_summary_fp, " \"gc_percent\": %.2f,\n", py_output_fa.long_read_info.gc_cnt * 100); + + // NXX read lengths + fprintf(read_summary_fp, " \"NXX_read_length\": {\n"); + for (int percent = 5; percent < 100; percent += 5) { + fprintf(read_summary_fp, " \"N%02d\": %d%s\n", percent, py_output_fa.long_read_info.NXX_read_length[percent], (percent + 5 < 100) ? "," : ""); + } + fprintf(read_summary_fp, " },\n"); - fprintf(read_summary_fp, "\n\n"); + // GC content distribution + fprintf(read_summary_fp, " \"gc_content_distribution\": {\n"); + for (int gc_ratio = 0; gc_ratio <= 100; gc_ratio++) { + fprintf(read_summary_fp, " \"%d\": %d%s\n", gc_ratio, py_output_fa.long_read_info.read_gc_content_count[gc_ratio], (gc_ratio < 100) ? "," : ""); + } + fprintf(read_summary_fp, " }\n"); - fprintf(read_summary_fp, "GC content\tnumber of reads\n"); - for (int gc_ratio = 0; gc_ratio <= 100; gc_ratio++) - { - fprintf(read_summary_fp, "GC=%d%%\t%d\n", gc_ratio, py_output_fa.long_read_info.read_gc_content_count[gc_ratio]); + fprintf(read_summary_fp, "}\n"); + fclose(read_summary_fp); } - fclose(read_summary_fp); } } } diff --git a/src/generate_html.py b/src/generate_html.py index dce7237..06a0f6a 100644 --- a/src/generate_html.py +++ b/src/generate_html.py @@ -20,9 +20,10 @@ def __init__(self, para_list, plot_filepaths, static=True): else: self.more_input_files = False - def generate_header(self): + def generate_header(self, sample_name, filetype): """Format the header of the HTML file with the title and CSS.""" - html_filepath = self.input_para["output_folder"] + '/' + self.input_para["out_prefix"] + ".html" + html_filepath = self.input_para["output_folder"] + '/longreadsum_' + sample_name + '_' + filetype + '.html' + logging.info("Generating HTML file: %s", html_filepath) self.html_writer = open(html_filepath, 'w', encoding='utf-8') self.html_writer.write("") self.html_writer.write("") @@ -429,9 +430,9 @@ def generate_end(self): self.html_writer.close() # Main function for generating the HTML. - def generate_html(self, signal_plots=False): + def generate_html(self, sample_name, filetype, signal_plots=False): if signal_plots: - self.generate_header() + self.generate_header(sample_name, filetype) # Get the signal plots signal_plots = self.plot_filepaths["ont_signal"]['dynamic'] read_names = signal_plots.keys() @@ -440,7 +441,7 @@ def generate_html(self, signal_plots=False): self.generate_end() else: # Format base QC - self.generate_header() + self.generate_header(sample_name, filetype) self.generate_left() self.generate_right() self.generate_end() diff --git a/src/input_parameters.cpp b/src/input_parameters.cpp index c409526..3f9b976 100644 --- a/src/input_parameters.cpp +++ b/src/input_parameters.cpp @@ -1,6 +1,7 @@ #include #include "input_parameters.h" +#include "version.h" Input_Para::Input_Para(){ // Set default parameters @@ -13,6 +14,8 @@ Input_Para::Input_Para(){ this->base_mod_threshold = 0.5; this->gene_bed = ""; this->mod_analysis = false; + this->sample_name = "Sample"; + this->version_str = VERSION; } Input_Para::~Input_Para(){ @@ -28,3 +31,7 @@ std::string Input_Para::add_input_file(const std::string& input_filepath){ return "Only "+std::to_string(MAX_INPUT_FILES)+" input files are supported!!"; } } + +const std::string& Input_Para::getVersion() const { + return this->version_str; +} diff --git a/tests/test_general.py b/tests/test_general.py index c0a8420..dd60ceb 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -21,9 +21,12 @@ def fasta_output(): """ # Set parameters default_parameters = lrst.Input_Para() - output_folder = os.path.abspath(str("output/")) + output_folder = os.path.abspath(str("output/fasta/")) + if not os.path.exists(output_folder): + os.makedirs(output_folder) + default_parameters.output_folder = output_folder - default_parameters.out_prefix = str("fa_") + default_parameters.sample_name = "HG002" # Check if running remotely local_dir = os.path.expanduser('~/github/LongReadSum') @@ -80,9 +83,13 @@ def multiple_fasta_output(): """ # Set parameters default_parameters = lrst.Input_Para() - output_folder = os.path.abspath(str("output/")) + output_folder = os.path.abspath(str("output/multi_fasta/")) + if not os.path.exists(output_folder): + os.makedirs(output_folder) + default_parameters.output_folder = output_folder - default_parameters.out_prefix = str("fa_multi_") + default_parameters.sample_name = "HG002_multi" + # default_parameters.out_prefix = str("fa_multi_") # Check if running remotely local_dir = os.path.expanduser('~/github/LongReadSum') From a3ec7f2de067e049404e9a1dfcfe68ac7a540c07 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 23 Jul 2025 14:24:31 -0400 Subject: [PATCH 2/9] update output file formats --- src/bam_module.cpp | 14 +++-- src/output_data.cpp | 127 +++++++++++++++++++++--------------------- src/seqtxt_module.cpp | 2 +- tests/test_general.py | 7 ++- 4 files changed, 79 insertions(+), 71 deletions(-) diff --git a/src/bam_module.cpp b/src/bam_module.cpp index e73c9fb..c697805 100644 --- a/src/bam_module.cpp +++ b/src/bam_module.cpp @@ -154,8 +154,6 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ while (reader.hasNextRecord()){ // Read the next batch of records - // std::cout << "Generating " << thread_count << " thread(s)..." << - // std::endl; printMessage("Generating " + std::to_string(thread_count) + " thread(s)..."); std::vector thread_vector; for (int thread_index=0; thread_index; - - // Save basic statistics for total, passed, and failed reads - for (const ReadInfo& read_type : { - ReadInfo("All", all_long_read_info.long_read_info), - ReadInfo("Passed", passed_long_read_info.long_read_info), - ReadInfo("Failed", failed_long_read_info.long_read_info) - }) { - std::string read_filter = std::get<0>(read_type); - Basic_Seq_Statistics& long_read_info = std::get<1>(read_type); - - fprintf(fp, "%s reads:\n", read_filter.c_str()); - fprintf(fp, "Total number of reads\t%d\n", long_read_info.total_num_reads); - fprintf(fp, "Total number of bases\t%ld\n", long_read_info.total_num_bases); - fprintf(fp, "Longest read length\t%d\n", long_read_info.longest_read_length); - fprintf(fp, "N50 read length\t%d\n", long_read_info.n50_read_length); - fprintf(fp, "Mean read length\t%.2f\n", long_read_info.mean_read_length); - fprintf(fp, "Median read length\t%d\n", long_read_info.median_read_length); - fprintf(fp, "\n"); - } + // Write JSON output for all, passed, and failed reads + fprintf(fp, "{\n"); + + // Helper lambda to write a block for each read type + auto write_read_info = [fp](const char* label, const Basic_Seq_Statistics& info, bool last) { + fprintf(fp, " \"%s\": {\n", label); + fprintf(fp, " \"total_num_reads\": %d,\n", info.total_num_reads); + fprintf(fp, " \"total_num_bases\": %ld,\n", info.total_num_bases); + fprintf(fp, " \"longest_read_length\": %d,\n", info.longest_read_length); + fprintf(fp, " \"n50_read_length\": %d,\n", info.n50_read_length); + fprintf(fp, " \"mean_read_length\": %.2f,\n", info.mean_read_length); + fprintf(fp, " \"median_read_length\": %d\n", info.median_read_length); + fprintf(fp, " }%s\n", last ? "" : ","); + }; + + write_read_info("all", all_long_read_info.long_read_info, false); + write_read_info("passed", passed_long_read_info.long_read_info, false); + write_read_info("failed", failed_long_read_info.long_read_info, true); + + fprintf(fp, "}\n"); fclose(fp); } } diff --git a/src/seqtxt_module.cpp b/src/seqtxt_module.cpp index 991b87f..ca7b95c 100644 --- a/src/seqtxt_module.cpp +++ b/src/seqtxt_module.cpp @@ -217,7 +217,7 @@ int SeqTxt_Module::generateStatistics( Output_SeqTxt& t_output_SeqTxt_info){ // Save summary statistics to the output file std::cout << "Saving summary statistics to file..." << std::endl; - std::string summary_filepath = _input_parameters.output_folder + "/seqtxt_summary.txt"; + std::string summary_filepath = _input_parameters.output_folder + "/" + _input_parameters.sample_name + "_summary.seqtxt.json"; t_output_SeqTxt_info.save_summary(summary_filepath, _input_parameters); auto relapse_end_time = std::chrono::high_resolution_clock::now(); diff --git a/tests/test_general.py b/tests/test_general.py index dd60ceb..c26ff4a 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -358,9 +358,12 @@ def bam_output(): """ # Set parameters default_parameters = lrst.Input_Para() - output_folder = os.path.abspath(str("output/")) + output_folder = os.path.abspath(str("output/bam")) + if not os.path.exists(output_folder): + os.makedirs(output_folder) + default_parameters.output_folder = output_folder - default_parameters.out_prefix = str("bam_") + default_parameters.sample_name = "HG002" # Check if running remotely local_dir = os.path.expanduser('~/github/LongReadSum') From af53e0ef38fc8f1ff14ef75bb03362bbafb740d9 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 24 Jul 2025 13:31:34 -0400 Subject: [PATCH 3/9] update fastq --- src/fastq_module.cpp | 55 ++++++++++++++++++++++++++----------------- tests/test_general.py | 10 ++++---- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/src/fastq_module.cpp b/src/fastq_module.cpp index 3d87a32..0d22012 100644 --- a/src/fastq_module.cpp +++ b/src/fastq_module.cpp @@ -162,9 +162,9 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data) std::string read_details_file, read_summary_file; FILE *read_details_fp, *read_summary_fp; - read_details_file = _input_data.output_folder + "/FASTQ_details.txt"; - read_summary_file = _input_data.output_folder + "/FASTQ_summary.txt"; - + read_details_file = _input_data.output_folder + "/" + _input_data.sample_name + "_readqc.fastq.txt"; + read_summary_file = _input_data.output_folder + "/" + _input_data.sample_name + "_summary.fastq.json"; + output_data.long_read_info.total_num_reads = ZeroDefault; // total number of long reads output_data.long_read_info.total_num_bases = ZeroDefault; // total number of bases @@ -281,40 +281,51 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data) output_data.long_read_info.n05_read_length = output_data.long_read_info.NXX_read_length[5]; read_summary_fp = fopen(read_summary_file.c_str(), "w"); - fprintf(read_summary_fp, "total number of reads\t%d\n", output_data.long_read_info.total_num_reads); - fprintf(read_summary_fp, "total number of bases\t%ld\n", output_data.long_read_info.total_num_bases); - fprintf(read_summary_fp, "longest read length\t%d\n", output_data.long_read_info.longest_read_length); - fprintf(read_summary_fp, "N50 read length\t%d\n", output_data.long_read_info.n50_read_length); - fprintf(read_summary_fp, "mean read length\t%.2f\n", output_data.long_read_info.mean_read_length); - fprintf(read_summary_fp, "median read length\t%d\n", output_data.long_read_info.median_read_length); - fprintf(read_summary_fp, "GC%%\t%.2f\n", output_data.long_read_info.gc_cnt * 100); - fprintf(read_summary_fp, "\n\n"); + // Write summary in JSON format + fprintf(read_summary_fp, "{\n"); + fprintf(read_summary_fp, " \"filetype\": \"fastq\",\n"); + fprintf(read_summary_fp, " \"longreadsum_version\": \"%s\",\n", _input_data.getVersion().c_str()); + fprintf(read_summary_fp, " \"total_num_reads\": %d,\n", output_data.long_read_info.total_num_reads); + fprintf(read_summary_fp, " \"total_num_bases\": %ld,\n", output_data.long_read_info.total_num_bases); + fprintf(read_summary_fp, " \"longest_read_length\": %d,\n", output_data.long_read_info.longest_read_length); + fprintf(read_summary_fp, " \"n50_read_length\": %d,\n", output_data.long_read_info.n50_read_length); + fprintf(read_summary_fp, " \"mean_read_length\": %.2f,\n", output_data.long_read_info.mean_read_length); + fprintf(read_summary_fp, " \"median_read_length\": %d,\n", output_data.long_read_info.median_read_length); + fprintf(read_summary_fp, " \"gc_percent\": %.2f,\n", output_data.long_read_info.gc_cnt * 100); + + // NXX read lengths + fprintf(read_summary_fp, " \"NXX_read_length\": {\n"); for (int percent = 5; percent < 100; percent += 5) { - fprintf(read_summary_fp, "N%02d read length\t%.d\n", percent, output_data.long_read_info.NXX_read_length[percent]); + fprintf(read_summary_fp, " \"N%02d\": %d%s\n", percent, output_data.long_read_info.NXX_read_length[percent], (percent + 5 < 100) ? "," : ""); } + fprintf(read_summary_fp, " },\n"); - fprintf(read_summary_fp, "\n\n"); - - fprintf(read_summary_fp, "GC content\tnumber of reads\n"); + // GC content distribution + fprintf(read_summary_fp, " \"gc_content_distribution\": {\n"); for (int gc_ratio = 0; gc_ratio < 100; gc_ratio++) { - fprintf(read_summary_fp, "GC=%d%%\t%d\n", gc_ratio, output_data.long_read_info.read_gc_content_count[gc_ratio]); + fprintf(read_summary_fp, " \"%d\": %d%s\n", gc_ratio, output_data.long_read_info.read_gc_content_count[gc_ratio], (gc_ratio < 99) ? "," : ""); } + fprintf(read_summary_fp, " },\n"); - fprintf(read_summary_fp, "\n\n"); - fprintf(read_summary_fp, "base quality\tnumber of bases\n"); + // Base quality distribution + fprintf(read_summary_fp, " \"base_quality_distribution\": {\n"); for (int baseq = 0; baseq <= 60; baseq++) { - fprintf(read_summary_fp, "%d\t%ld\n", baseq, output_data.seq_quality_info.base_quality_distribution[baseq]); + fprintf(read_summary_fp, " \"%d\": %ld%s\n", baseq, output_data.seq_quality_info.base_quality_distribution[baseq], (baseq < 60) ? "," : ""); } + fprintf(read_summary_fp, " },\n"); - fprintf(read_summary_fp, "\n\n"); - fprintf(read_summary_fp, "read average base quality\tnumber of reads\n"); + // Read average base quality distribution + fprintf(read_summary_fp, " \"read_average_base_quality_distribution\": {\n"); for (int baseq = 0; baseq <= 60; baseq++) { - fprintf(read_summary_fp, "%d\t%d\n", baseq, output_data.seq_quality_info.read_average_base_quality_distribution[baseq]); + fprintf(read_summary_fp, " \"%d\": %d%s\n", baseq, output_data.seq_quality_info.read_average_base_quality_distribution[baseq], (baseq < 60) ? "," : ""); } + fprintf(read_summary_fp, " }\n"); + + fprintf(read_summary_fp, "}\n"); fclose(read_summary_fp); } } diff --git a/tests/test_general.py b/tests/test_general.py index c26ff4a..4cf8183 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -88,8 +88,7 @@ def multiple_fasta_output(): os.makedirs(output_folder) default_parameters.output_folder = output_folder - default_parameters.sample_name = "HG002_multi" - # default_parameters.out_prefix = str("fa_multi_") + default_parameters.sample_name = "HG002" # Check if running remotely local_dir = os.path.expanduser('~/github/LongReadSum') @@ -152,9 +151,12 @@ def fastq_output(): """ # Set parameters default_parameters = lrst.Input_Para() - output_folder = os.path.abspath(str("output/")) + output_folder = os.path.abspath(str("output/fastq/")) + if not os.path.exists(output_folder): + os.makedirs(output_folder) + default_parameters.output_folder = output_folder - default_parameters.out_prefix = str("fq_") + default_parameters.sample_name = "HG002" # Check if running remotely local_dir = os.path.expanduser('~/github/LongReadSum') From 0cad7416974de58ef1137deb76284175166b1004 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 25 Jul 2025 13:10:23 -0400 Subject: [PATCH 4/9] fast5 and fast5 signal mode --- src/fast5_module.cpp | 233 ++++++++++++++++++++++-------------------- tests/test_general.py | 14 ++- 2 files changed, 131 insertions(+), 116 deletions(-) diff --git a/src/fast5_module.cpp b/src/fast5_module.cpp index 76d4f32..c86e463 100644 --- a/src/fast5_module.cpp +++ b/src/fast5_module.cpp @@ -586,127 +586,136 @@ int generateQCForFAST5(Input_Para &_input_data, Output_FAST5 &output_data) // Write QC details to the file exit_code = writeSignalQCDetails(input_file, output_data, read_id_list); } + } + // Generate the usual read and base QC output + read_details_file = _input_data.output_folder + "/" + _input_data.sample_name + "_readqc.fast5.txt"; + read_summary_file = _input_data.output_folder + "/" + _input_data.sample_name + "_summary.fast5.json"; + + // Set up the output summary text file + read_details_fp = fopen(read_details_file.c_str(), "w"); + if (NULL == read_details_fp) + { + std::cerr << "Failed to set up output file: " << read_details_file << std::endl; + exit_code = 3; } else { - // Generate the usual read and base QC output - read_details_file = _input_data.output_folder + "/FAST5_details.txt"; - read_summary_file = _input_data.output_folder + "/FAST5_summary.txt"; - - // Set up the output summary text file - read_details_fp = fopen(read_details_file.c_str(), "w"); - if (NULL == read_details_fp) - { - std::cerr << "Failed to set up output file: " << read_details_file << std::endl; - exit_code = 3; - } else { - fprintf(read_details_fp, "#read_name\tlength\tGC\taverage_baseq_quality\n"); - - // Loop through each input file and get the QC data across files - size_t file_count = _input_data.num_input_files; - - // Write QC details to the file - for (size_t i = 0; i < file_count; i++) - { - const std::string input_file = _input_data.input_files[i]; - exit_code = writeBaseQCDetails(input_file, output_data, read_details_fp); - } - fclose(read_details_fp); - - // Check if the GC content was calculated successfully - if (exit_code == 0) { - - // Add the G + C bases - double g_c = output_data.long_read_info.total_g_cnt + output_data.long_read_info.total_c_cnt; + fprintf(read_details_fp, "#read_name\tlength\tGC\taverage_baseq_quality\n"); - // Add all bases - double a_tu_g_c = g_c + output_data.long_read_info.total_a_cnt + output_data.long_read_info.total_tu_cnt; - - // Calculate read length statistics if base counts are not zero - uint64_t total_num_bases = output_data.long_read_info.total_num_bases; - if (total_num_bases == 0) { - std::cerr << "No bases found in input files." << std::endl; - exit_code = 3; - } else { - // Calculate GC-content - output_data.long_read_info.gc_cnt = g_c / a_tu_g_c; - - // Sort the read lengths in descending order - std::vector read_lengths = output_data.long_read_info.read_lengths; - std::sort(read_lengths.begin(), read_lengths.end(), std::greater()); - - // Get the max read length - int64_t max_read_length = read_lengths.at(0); - output_data.long_read_info.longest_read_length = max_read_length; - - // Get the median read length - int64_t median_read_length = read_lengths[read_lengths.size() / 2]; - output_data.long_read_info.median_read_length = median_read_length; - - // Get the mean read length - float mean_read_length = (double)total_num_bases / (double)read_lengths.size(); - output_data.long_read_info.mean_read_length = mean_read_length; - - // Calculate N50 and other N-scores - for (int percent_value = 1; percent_value <= 100; percent_value++) - { - // Get the base percentage threshold for this N-score - double base_threshold = (double)total_num_bases * (percent_value / 100.0); - - // Calculate the NXX score - double current_base_count = 0; - int current_read_index = -1; - while (current_base_count < base_threshold) { - current_read_index ++; - current_base_count += read_lengths.at(current_read_index); - } - int nxx_read_length = read_lengths.at(current_read_index); - output_data.long_read_info.NXX_read_length[percent_value] = nxx_read_length; - } + // Loop through each input file and get the QC data across files + size_t file_count = _input_data.num_input_files; - // Set common score variables - output_data.long_read_info.n50_read_length = output_data.long_read_info.NXX_read_length[50]; - output_data.long_read_info.n95_read_length = output_data.long_read_info.NXX_read_length[95]; - output_data.long_read_info.n05_read_length = output_data.long_read_info.NXX_read_length[5]; - - // Create the summary file - std::cout << "Writing summary file: " << read_summary_file.c_str() << std::endl; - read_summary_fp = fopen(read_summary_file.c_str(), "w"); - fprintf(read_summary_fp, "total number of reads\t%d\n", output_data.long_read_info.total_num_reads); - fprintf(read_summary_fp, "total number of bases\t%ld\n", output_data.long_read_info.total_num_bases); - fprintf(read_summary_fp, "longest read length\t%d\n", output_data.long_read_info.longest_read_length); - fprintf(read_summary_fp, "N50 read length\t%d\n", output_data.long_read_info.n50_read_length); - fprintf(read_summary_fp, "mean read length\t%.2f\n", output_data.long_read_info.mean_read_length); - fprintf(read_summary_fp, "median read length\t%d\n", output_data.long_read_info.median_read_length); - fprintf(read_summary_fp, "GC%%\t%.2f\n", output_data.long_read_info.gc_cnt * 100); - fprintf(read_summary_fp, "\n\n"); - for (int percent = 5; percent < 100; percent += 5) - { - fprintf(read_summary_fp, "N%02d read length\t%.d\n", percent, output_data.long_read_info.NXX_read_length[percent]); + // Write QC details to the file + for (size_t i = 0; i < file_count; i++) + { + const std::string input_file = _input_data.input_files[i]; + exit_code = writeBaseQCDetails(input_file, output_data, read_details_fp); + } + fclose(read_details_fp); + + // Check if the GC content was calculated successfully + if (exit_code == 0) { + + // Add the G + C bases + double g_c = output_data.long_read_info.total_g_cnt + output_data.long_read_info.total_c_cnt; + + // Add all bases + double a_tu_g_c = g_c + output_data.long_read_info.total_a_cnt + output_data.long_read_info.total_tu_cnt; + + // Calculate read length statistics if base counts are not zero + uint64_t total_num_bases = output_data.long_read_info.total_num_bases; + if (total_num_bases == 0) { + std::cerr << "No bases found in input files." << std::endl; + exit_code = 3; + } else { + // Calculate GC-content + output_data.long_read_info.gc_cnt = g_c / a_tu_g_c; + + // Sort the read lengths in descending order + std::vector read_lengths = output_data.long_read_info.read_lengths; + std::sort(read_lengths.begin(), read_lengths.end(), std::greater()); + + // Get the max read length + int64_t max_read_length = read_lengths.at(0); + output_data.long_read_info.longest_read_length = max_read_length; + + // Get the median read length + int64_t median_read_length = read_lengths[read_lengths.size() / 2]; + output_data.long_read_info.median_read_length = median_read_length; + + // Get the mean read length + float mean_read_length = (double)total_num_bases / (double)read_lengths.size(); + output_data.long_read_info.mean_read_length = mean_read_length; + + // Calculate N50 and other N-scores + for (int percent_value = 1; percent_value <= 100; percent_value++) + { + // Get the base percentage threshold for this N-score + double base_threshold = (double)total_num_bases * (percent_value / 100.0); + + // Calculate the NXX score + double current_base_count = 0; + int current_read_index = -1; + while (current_base_count < base_threshold) { + current_read_index ++; + current_base_count += read_lengths.at(current_read_index); } + int nxx_read_length = read_lengths.at(current_read_index); + output_data.long_read_info.NXX_read_length[percent_value] = nxx_read_length; + } - fprintf(read_summary_fp, "\n\n"); - - fprintf(read_summary_fp, "GC content\tnumber of reads\n"); - for (int gc_ratio = 0; gc_ratio < 100; gc_ratio++) - { - fprintf(read_summary_fp, "GC=%d%%\t%d\n", gc_ratio, output_data.long_read_info.read_gc_content_count[gc_ratio]); - } + // Set common score variables + output_data.long_read_info.n50_read_length = output_data.long_read_info.NXX_read_length[50]; + output_data.long_read_info.n95_read_length = output_data.long_read_info.NXX_read_length[95]; + output_data.long_read_info.n05_read_length = output_data.long_read_info.NXX_read_length[5]; + + // Create the summary file + std::cout << "Writing summary file: " << read_summary_file.c_str() << std::endl; + read_summary_fp = fopen(read_summary_file.c_str(), "w"); + // Write summary in JSON format + fprintf(read_summary_fp, "{\n"); + fprintf(read_summary_fp, " \"filetype\": \"fast5\",\n"); + fprintf(read_summary_fp, " \"longreadsum_version\": \"%s\",\n", _input_data.getVersion().c_str()); + fprintf(read_summary_fp, " \"total_num_reads\": %d,\n", output_data.long_read_info.total_num_reads); + fprintf(read_summary_fp, " \"total_num_bases\": %ld,\n", output_data.long_read_info.total_num_bases); + fprintf(read_summary_fp, " \"longest_read_length\": %d,\n", output_data.long_read_info.longest_read_length); + fprintf(read_summary_fp, " \"n50_read_length\": %d,\n", output_data.long_read_info.n50_read_length); + fprintf(read_summary_fp, " \"mean_read_length\": %.2f,\n", output_data.long_read_info.mean_read_length); + fprintf(read_summary_fp, " \"median_read_length\": %d,\n", output_data.long_read_info.median_read_length); + fprintf(read_summary_fp, " \"gc_percent\": %.2f,\n", output_data.long_read_info.gc_cnt * 100); + + // NXX read lengths + fprintf(read_summary_fp, " \"NXX_read_length\": {\n"); + for (int percent = 5; percent < 100; percent += 5) + { + fprintf(read_summary_fp, " \"N%02d\": %d%s\n", percent, output_data.long_read_info.NXX_read_length[percent], (percent + 5 < 100) ? "," : ""); + } + fprintf(read_summary_fp, " },\n"); + // GC content distribution + fprintf(read_summary_fp, " \"gc_content_distribution\": {\n"); + for (int gc_ratio = 0; gc_ratio < 100; gc_ratio++) + { + fprintf(read_summary_fp, " \"%d\": %d%s\n", gc_ratio, output_data.long_read_info.read_gc_content_count[gc_ratio], (gc_ratio + 1 < 100) ? "," : ""); + } + fprintf(read_summary_fp, " },\n"); - fprintf(read_summary_fp, "\n\n"); - fprintf(read_summary_fp, "base quality\tnumber of bases\n"); - for (int baseq = 0; baseq <= 60; baseq++) - { - fprintf(read_summary_fp, "%d\t%ld\n", baseq, output_data.seq_quality_info.base_quality_distribution[baseq]); - } + // Base quality distribution + fprintf(read_summary_fp, " \"base_quality_distribution\": {\n"); + for (int baseq = 0; baseq <= 60; baseq++) + { + fprintf(read_summary_fp, " \"%d\": %ld%s\n", baseq, output_data.seq_quality_info.base_quality_distribution[baseq], (baseq < 60) ? "," : ""); + } + fprintf(read_summary_fp, " },\n"); - fprintf(read_summary_fp, "\n\n"); - fprintf(read_summary_fp, "read average base quality\tnumber of reads\n"); - for (int baseq = 0; baseq <= 60; baseq++) - { - fprintf(read_summary_fp, "%d\t%d\n", baseq, output_data.seq_quality_info.read_average_base_quality_distribution[baseq]); - } - fclose(read_summary_fp); + // Read average base quality distribution + fprintf(read_summary_fp, " \"read_average_base_quality_distribution\": {\n"); + for (int baseq = 0; baseq <= 60; baseq++) + { + fprintf(read_summary_fp, " \"%d\": %d%s\n", baseq, output_data.seq_quality_info.read_average_base_quality_distribution[baseq], (baseq < 60) ? "," : ""); } + fprintf(read_summary_fp, " }\n"); + + fprintf(read_summary_fp, "}\n"); + fclose(read_summary_fp); } } } diff --git a/tests/test_general.py b/tests/test_general.py index 4cf8183..e21c097 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -214,9 +214,12 @@ def fast5_output(): """ # Set parameters default_parameters = lrst.Input_Para() - output_folder = os.path.abspath(str("output/")) + output_folder = os.path.abspath(str("output/fast5/")) + if not os.path.exists(output_folder): + os.makedirs(output_folder) + default_parameters.output_folder = output_folder - default_parameters.out_prefix = str("f5_") + default_parameters.sample_name = "HG002" # Check if running remotely file_dir = '' @@ -289,9 +292,12 @@ def fast5s_output(): """ # Set parameters default_parameters = lrst.Input_Para() - output_folder = os.path.abspath(str("output/")) + output_folder = os.path.abspath(str("output/fast5s/")) + if not os.path.exists(output_folder): + os.makedirs(output_folder) + default_parameters.output_folder = output_folder - default_parameters.out_prefix = str("f5s_") + default_parameters.sample_name = "HG002" default_parameters.other_flags = 1 # 0 for normal QC, 1 for signal statistics output # Check if running remotely From 730b491d70cc0ce088e620237f3636bf6dfcbea1 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sun, 27 Jul 2025 13:08:37 -0400 Subject: [PATCH 5/9] multiqc base mods --- src/bam_module.cpp | 2 - src/output_data.cpp | 85 ++++++++++++++++++++++++++++++++++++++++++- tests/test_general.py | 27 ++++++++++---- 3 files changed, 103 insertions(+), 11 deletions(-) diff --git a/src/bam_module.cpp b/src/bam_module.cpp index c697805..695adaa 100644 --- a/src/bam_module.cpp +++ b/src/bam_module.cpp @@ -225,8 +225,6 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ output_prefix += input_params.rrms_filter ? "_accepted" : "_rejected"; } - // std::string summary_filepath = input_params.output_folder + "/" + - // output_prefix + "_summary.txt"; std::string summary_filepath = input_params.output_folder + "/" + output_prefix + "_summary.bam.json"; final_output.save_summary(summary_filepath, input_params, final_output); diff --git a/src/output_data.cpp b/src/output_data.cpp index ccdb85f..a34ad1f 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -592,7 +592,90 @@ void Output_BAM::save_summary(std::string &output_file, Input_Para ¶ms, Outp fprintf(fp, " \"insertions\": %ld,\n", output_data.num_ins_bases); fprintf(fp, " \"deletions\": %ld,\n", output_data.num_del_bases); fprintf(fp, " \"clipped\": %ld\n", output_data.num_clip_bases); - fprintf(fp, " }\n"); + + // Determine if there is base modification data + bool modification_data_exists = output_data.modified_prediction_count > 0; + if (modification_data_exists) { + fprintf(fp, " },\n"); + } else { + fprintf(fp, " }\n"); + } + + // Base modification statistics (if any) + if (modification_data_exists) { + + // Map of modification character to full name + std::unordered_map mod_char_to_name = { + {'m', "5mC"}, {'h', "5hmC"}, {'f', "5fC"}, {'c', "5caC"}, + {'g', "5hmU"}, {'e', "5fu"}, {'b', "5caU"}, + {'a', "6mA"}, {'o', "8oxoG"}, {'n', "Xao"}, + {'C', "Amb. C"}, {'A', "Amb. A"}, {'T', "Amb. T"}, {'G', "Amb. G"}, + {'N', "Amb. N"}, + {'v', "pseU"} + }; + + fprintf(fp, " \"base_modifications\": {\n"); + fprintf(fp, " \"unfiltered_modifications\": %lu,\n", output_data.modified_prediction_count); + fprintf(fp, " \"filter_threshold\": %.2f,\n", params.base_mod_threshold); + fprintf(fp, " \"sample_modified_base_count\": %lu,\n", output_data.sample_modified_base_count); + fprintf(fp, " \"sample_modified_base_count_forward\": %lu,\n", output_data.sample_modified_base_count_forward); + fprintf(fp, " \"sample_modified_base_count_reverse\": %lu,\n", output_data.sample_modified_base_count_reverse); + fprintf(fp, " \"cpg_forward\": %lu,\n", output_data.sample_cpg_forward_count); + fprintf(fp, " \"cpg_reverse\": %lu,\n", output_data.sample_cpg_reverse_count); + fprintf(fp, " \"base_mod_counts\": {\n"); + for (auto it = output_data.base_mod_counts.begin(); it != output_data.base_mod_counts.end(); ++it) { + char mod_type = it->first; + std::string mod_name = std::string(1, mod_type); + auto it_char = mod_char_to_name.find(mod_type); + if (it_char != mod_char_to_name.end()) { + mod_name = it_char->second; + } + uint64_t count = it->second; + // fprintf(fp, " \"%c\": %lu", mod_type, count); + fprintf(fp, " \"%s\": %lu", mod_name.c_str(), count); + if (std::next(it) != output_data.base_mod_counts.end()) { + fprintf(fp, ",\n"); + } else { + fprintf(fp, "\n"); + } + } + fprintf(fp, " },\n"); + fprintf(fp, " \"base_mod_counts_forward\": {\n"); + for (auto it = output_data.base_mod_counts_forward.begin(); it != output_data.base_mod_counts_forward.end(); ++it) { + char mod_type = it->first; + std::string mod_name = std::string(1, mod_type); + auto it_char = mod_char_to_name.find(mod_type); + if (it_char != mod_char_to_name.end()) { + mod_name = it_char->second; + } + uint64_t count = it->second; + fprintf(fp, " \"%s\": %lu", mod_name.c_str(), count); + if (std::next(it) != output_data.base_mod_counts_forward.end()) { + fprintf(fp, ",\n"); + } else { + fprintf(fp, "\n"); + } + } + fprintf(fp, " },\n"); + fprintf(fp, " \"base_mod_counts_reverse\": {\n"); + for (auto it = output_data.base_mod_counts_reverse.begin(); it != output_data.base_mod_counts_reverse.end(); ++it) { + char mod_type = it->first; + std::string mod_name = std::string(1, mod_type); + auto it_char = mod_char_to_name.find(mod_type); + if (it_char != mod_char_to_name.end()) { + mod_name = it_char->second; + } + uint64_t count = it->second; + fprintf(fp, " \"%s\": %lu", mod_name.c_str(), count); + if (std::next(it) != output_data.base_mod_counts_reverse.end()) { + fprintf(fp, ",\n"); + } else { + fprintf(fp, "\n"); + } + } + fprintf(fp, " }\n"); + fprintf(fp, " }\n"); + } fprintf(fp, "}\n"); fclose(fp); diff --git a/tests/test_general.py b/tests/test_general.py index e21c097..2ac7d2f 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -438,9 +438,12 @@ def unmapped_bam_output(): """Run the BAM module on unmapped inputs.""" # Set parameters default_parameters = lrst.Input_Para() - output_folder = os.path.abspath(str("output/")) + output_folder = os.path.abspath(str("output/ubam/")) + if not os.path.exists(output_folder): + os.makedirs(output_folder) + default_parameters.output_folder = output_folder - default_parameters.out_prefix = str("ubam_") + default_parameters.sample_name = "HG002" # Check if running remotely local_dir = os.path.expanduser('~/github/LongReadSum') @@ -505,11 +508,15 @@ def forward_base_mod_output(): """Run the BAM module on a read aligned to the forward strand with base modifications.""" # Set parameters default_parameters = lrst.Input_Para() - output_folder = os.path.abspath(str("output/")) + output_folder = os.path.abspath(str("output/fmod/")) + if not os.path.exists(output_folder): + os.makedirs(output_folder) + default_parameters.output_folder = output_folder - default_parameters.out_prefix = str("fwdmod_") + default_parameters.sample_name = "HG002" + default_parameters.mod_analysis = True - default_parameters.base_mod_threshold = -1.0 + default_parameters.base_mod_threshold = 0 # Check if running remotely local_dir = os.path.expanduser('~/github/LongReadSum') @@ -577,11 +584,15 @@ def reverse_base_mod_output(): """Run the BAM module on a read aligned to the reverse strand with base modifications.""" # Set parameters default_parameters = lrst.Input_Para() - output_folder = os.path.abspath(str("output/")) + output_folder = os.path.abspath(str("output/rmod/")) + if not os.path.exists(output_folder): + os.makedirs(output_folder) + default_parameters.output_folder = output_folder - default_parameters.out_prefix = str("revmod_") + default_parameters.sample_name = "HG002" + default_parameters.mod_analysis = True - default_parameters.base_mod_threshold = -1.0 + default_parameters.base_mod_threshold = 0 # Check if running remotely local_dir = os.path.expanduser('~/github/LongReadSum') From a0981ce5353da5098e7ea64635c9784a4aa1992d Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sun, 27 Jul 2025 14:53:23 -0400 Subject: [PATCH 6/9] add sequencing summary multiqc --- src/output_data.cpp | 2 ++ src/seqtxt_module.cpp | 4 +++- tests/test_general.py | 9 ++++++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/output_data.cpp b/src/output_data.cpp index a34ad1f..b729094 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -733,6 +733,8 @@ void Output_SeqTxt::save_summary(std::string & output_file, Input_Para & params) } else { // Write JSON output for all, passed, and failed reads fprintf(fp, "{\n"); + fprintf(fp, " \"filetype\": \"sequencing_summary\",\n"); + fprintf(fp, " \"longreadsum_version\": \"%s\",\n", params.getVersion().c_str()); // Helper lambda to write a block for each read type auto write_read_info = [fp](const char* label, const Basic_Seq_Statistics& info, bool last) { diff --git a/src/seqtxt_module.cpp b/src/seqtxt_module.cpp index ca7b95c..dadfc88 100644 --- a/src/seqtxt_module.cpp +++ b/src/seqtxt_module.cpp @@ -217,7 +217,9 @@ int SeqTxt_Module::generateStatistics( Output_SeqTxt& t_output_SeqTxt_info){ // Save summary statistics to the output file std::cout << "Saving summary statistics to file..." << std::endl; - std::string summary_filepath = _input_parameters.output_folder + "/" + _input_parameters.sample_name + "_summary.seqtxt.json"; + // std::string summary_filepath = _input_parameters.output_folder + "/" + + // _input_parameters.sample_name + "_summary.json"; + std::string summary_filepath = _input_parameters.output_folder + "/" + _input_parameters.sample_name + "_summary.json"; t_output_SeqTxt_info.save_summary(summary_filepath, _input_parameters); auto relapse_end_time = std::chrono::high_resolution_clock::now(); diff --git a/tests/test_general.py b/tests/test_general.py index 2ac7d2f..d9bbc65 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -662,10 +662,13 @@ def seqtxt_output(): """ # Set parameters default_parameters = lrst.Input_Para() - output_folder = os.path.abspath(str("output/")) - default_parameters.output_folder = output_folder - default_parameters.out_prefix = str("seqtxt_") + output_folder = os.path.abspath(str("output/seqtxt/")) + if not os.path.exists(output_folder): + os.makedirs(output_folder) + default_parameters.output_folder = output_folder + default_parameters.sample_name = "HG002" + # Check if running remotely local_dir = os.path.expanduser('~/github/LongReadSum') if os.getcwd() == local_dir: From d693cc167fff724bcc288bfb8c56ce17cd9e883e Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 28 Jul 2025 09:06:03 -0400 Subject: [PATCH 7/9] add tin scores --- include/tin.h | 2 +- src/bam_module.cpp | 2 +- src/output_data.cpp | 30 +++++++++++++++++++++++++++++- src/tin.cpp | 14 +++++++------- tests/test_general.py | 8 ++++++-- 5 files changed, 44 insertions(+), 12 deletions(-) diff --git a/include/tin.h b/include/tin.h index fd4c330..27e406c 100644 --- a/include/tin.h +++ b/include/tin.h @@ -15,7 +15,7 @@ typedef std::unordered_map getReadDepths(htsFile* bam_file, hts_idx_t* idx, bam_hdr_t* header, std::string chr, int start, int end, bool transcript_strand); diff --git a/src/bam_module.cpp b/src/bam_module.cpp index 695adaa..3780a8a 100644 --- a/src/bam_module.cpp +++ b/src/bam_module.cpp @@ -80,7 +80,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ std::cout << "Calculating TIN scores for file: " << filepath << std::endl; TINStats tin_stats; - calculateTIN(&tin_stats, gene_bed, input_params.input_files[i], min_cov, sample_size, input_params.output_folder, input_params.threads); + calculateTIN(tin_stats, gene_bed, input_params.input_files[i], min_cov, sample_size, input_params.sample_name, input_params.output_folder, input_params.threads); // Print the TIN stats std::cout << "Number of transcripts: " << tin_stats.num_transcripts << std::endl; diff --git a/src/output_data.cpp b/src/output_data.cpp index b729094..ea9116c 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -4,6 +4,7 @@ #include #include #include // std::round +#include #include "output_data.h" #include "utils.h" @@ -595,12 +596,39 @@ void Output_BAM::save_summary(std::string &output_file, Input_Para ¶ms, Outp // Determine if there is base modification data bool modification_data_exists = output_data.modified_prediction_count > 0; - if (modification_data_exists) { + bool tin_data_exists = !output_data.tin_data.empty(); + if (modification_data_exists || tin_data_exists) { fprintf(fp, " },\n"); } else { fprintf(fp, " }\n"); } + // TIN data statistics (if any) + if (tin_data_exists) { + fprintf(fp, " \"tin_data\": {\n"); + for (auto it = output_data.tin_data.begin(); it != output_data.tin_data.end(); ++it) { + // Print each BAM file's TIN statistics + const std::string& bam_file = it->first; + const TINStats& tin_data = it->second; + fprintf(fp, " \"%s\": {\n", bam_file.c_str()); + fprintf(fp, " \"total_transcripts\": %d,\n", tin_data.num_transcripts); + fprintf(fp, " \"mean\": %.2f,\n", tin_data.mean); + fprintf(fp, " \"median\": %.2f,\n", tin_data.median); + fprintf(fp, " \"stddev\": %.2f\n", tin_data.stddev); + fprintf(fp, " }"); + if (std::next(it) != output_data.tin_data.end()) { + fprintf(fp, ",\n"); + } else { + fprintf(fp, "\n"); + } + } + if (modification_data_exists) { + fprintf(fp, " },\n"); + } else { + fprintf(fp, " }\n"); + } + } + // Base modification statistics (if any) if (modification_data_exists) { diff --git a/src/tin.cpp b/src/tin.cpp index da945a1..2919f50 100644 --- a/src/tin.cpp +++ b/src/tin.cpp @@ -178,7 +178,7 @@ bool checkMinReads(htsFile* bam_file, hts_idx_t* idx, bam_hdr_t* header, std::st return min_reads_met; } -void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder, int thread_count) +void calculateTIN(TINStats& tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& sample_name, const std::string& output_folder, int thread_count) { std::cout << "Using TIN minimum coverage " << min_cov << " and sample size " << sample_size << std::endl; @@ -474,7 +474,7 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s std::cout << "Writing TIN scores to file..." << std::endl; // Write the TIN scores to a file - std::string output_tin_tsv = output_folder + "/tin_scores.tsv"; + std::string output_tin_tsv = output_folder + "/" + sample_name + "_tin_scores.tsv"; std::ofstream output_tin_file(output_tin_tsv); output_tin_file << std::fixed << std::setprecision(14); @@ -502,7 +502,7 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s std::cout << "TIN scores written to " << output_tin_tsv << std::endl; // Write the TIN summary to a file - std::string output_tin_summary_tsv = output_folder + "/tin_summary.tsv"; + std::string output_tin_summary_tsv = output_folder + "/" + sample_name + "_tin_summary.tsv"; std::ofstream output_tin_summary_file(output_tin_summary_tsv); output_tin_summary_file << std::fixed << std::setprecision(14); @@ -524,9 +524,9 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s std::cout << "TIN summary written to " << output_tin_summary_tsv << std::endl; // Update the TIN stats struct - tin_stats->mean = TIN_mean; - tin_stats->median = TIN_median; - tin_stats->stddev = TIN_stddev; - tin_stats->num_transcripts = TIN_scores.size(); + tin_stats.mean = TIN_mean; + tin_stats.median = TIN_median; + tin_stats.stddev = TIN_stddev; + tin_stats.num_transcripts = TIN_scores.size(); } } diff --git a/tests/test_general.py b/tests/test_general.py index d9bbc65..695081b 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -740,9 +740,13 @@ def rnaseq_bam_output(): """Run the BAM module on RNASeq inputs.""" # Set parameters default_parameters = lrst.Input_Para() - output_folder = os.path.abspath(str("output/")) + output_folder = os.path.abspath(str("output/rnaseq/")) + if not os.path.exists(output_folder): + os.makedirs(output_folder) + default_parameters.output_folder = output_folder - default_parameters.out_prefix = str("rnaseq_") + default_parameters.sample_name = "GTEX" + default_parameters.tin_sample_size = 100 default_parameters.tin_min_coverage = 2 From c6d19c9883ef895a8fd381a9ad106a596bba5e94 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 30 Jul 2025 11:37:03 -0400 Subject: [PATCH 8/9] update readme for multiqc --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 47307b0..a85c347 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ LongReadSum supports FASTA, FASTQ, BAM, FAST5, and sequencing_summary.txt file f - [Installation using Anaconda (recommended)](#installation-using-anaconda) - [Installation using Docker](#installation-using-anaconda) - [Building from source](#building-from-source) +- [MultiQC support](#multiqc-support) - General usage for common filetypes: - [Common parameters](#common-parameters) - [WGS BAM](#wgs-bam) @@ -78,6 +79,16 @@ conda activate longreadsum make ``` +# MultiQC support +[MultiQC](https://seqera.io/multiqc/) is a widely used open-source tool for +aggregating bioinformatics analyses results from many tools across samples. + +To run MultiQC, input the LongReadSum directory containing the output JSON +summary file, and specify the _longreadsum_ module: + +``` +multiqc $INPUT_DIRECTORY --module longreadsum --outdir $OUTPUT_DIRECTORY/multiqc +``` ## Running Activate the conda environment and then run with arguments: From f6579a6b9395b82f62c5f1c417fa53fa65a0c171 Mon Sep 17 00:00:00 2001 From: Jon Perdomo Date: Wed, 30 Jul 2025 11:39:30 -0400 Subject: [PATCH 9/9] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index a85c347..d820ff9 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,9 @@ summary file, and specify the _longreadsum_ module: multiqc $INPUT_DIRECTORY --module longreadsum --outdir $OUTPUT_DIRECTORY/multiqc ``` +Example report: +image + ## Running Activate the conda environment and then run with arguments: ```