Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ INCL_DIR := $(CURDIR)/include
SRC_DIR := $(CURDIR)/src
LIB_DIR := $(CURDIR)/lib

VERSION := $(shell git describe --tags --always)
VERSION_HEADER := $(INCL_DIR)/version.h

# Set the library paths for the compiler
CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX)
LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib
Expand All @@ -10,6 +13,11 @@ INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include
# All targets
all: swig_build compile

# Rule to generate version.h
$(VERSION_HEADER):
@echo "#pragma once" > $@
@echo "#define VERSION \"$(VERSION)\"" >> $@

# Generate the SWIG Python/C++ wrappers
swig_build:
mkdir -p $(LIB_DIR)
Expand Down
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ LongReadSum supports FASTA, FASTQ, BAM, FAST5, and sequencing_summary.txt file f
- [Installation using Anaconda (recommended)](#installation-using-anaconda)
- [Installation using Docker](#installation-using-anaconda)
- [Building from source](#building-from-source)
- [MultiQC support](#multiqc-support)
- General usage for common filetypes:
- [Common parameters](#common-parameters)
- [WGS BAM](#wgs-bam)
Expand Down Expand Up @@ -78,6 +79,19 @@ conda activate longreadsum
make
```

# MultiQC support
[MultiQC](https://seqera.io/multiqc/) is a widely used open-source tool for
aggregating bioinformatics analyses results from many tools across samples.

To run MultiQC, input the LongReadSum directory containing the output JSON
summary file, and specify the _longreadsum_ module:

```
multiqc $INPUT_DIRECTORY --module longreadsum --outdir $OUTPUT_DIRECTORY/multiqc
```

Example report:
<img width="1707" height="761" alt="image" src="https://github.com/user-attachments/assets/adbcacf7-44f8-48bd-9135-9293379d65d2" />

## Running
Activate the conda environment and then run with arguments:
Expand Down
5 changes: 4 additions & 1 deletion include/input_parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ class Input_Para{
// Parameters
int threads;
size_t num_input_files;
std::string out_prefix;
int64_t other_flags;
int32_t user_defined_fastq_base_qual_offset;
std::string output_folder; // Output folder
Expand All @@ -36,10 +35,14 @@ class Input_Para{
bool mod_analysis; // Perform base modification analysis on BAM file
int tin_sample_size; // Number of equally spaced samples for TIN calculation
int tin_min_coverage; // Minimum coverage for TIN calculation
std::string sample_name; // Sample name
std::string version_str = ""; // Version string for the program

// Functions
std::string add_input_file(const std::string& input_filepath);

const std::string &getVersion() const;

Input_Para();

~Input_Para();
Expand Down
2 changes: 1 addition & 1 deletion include/tin.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ typedef std::unordered_map<std::string, std::tuple<std::string, int, int, double

// Calculate the TIN score for each transcript in the gene BED file
// (Reference: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-0922-z#Sec11)
void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder, int thread_count);
void calculateTIN(TINStats& tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& sample_name, const std::string& output_folder, int thread_count);

std::unordered_map<int, int> getReadDepths(htsFile* bam_file, hts_idx_t* idx, bam_hdr_t* header, std::string chr, int start, int end, bool transcript_strand);

Expand Down
2 changes: 2 additions & 0 deletions include/version.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#pragma once
#define VERSION "v1.4.0-5-g9e0d2c4"
14 changes: 7 additions & 7 deletions src/bam_module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
std::cout << "Calculating TIN scores for file: " << filepath << std::endl;

TINStats tin_stats;
calculateTIN(&tin_stats, gene_bed, input_params.input_files[i], min_cov, sample_size, input_params.output_folder, input_params.threads);
calculateTIN(tin_stats, gene_bed, input_params.input_files[i], min_cov, sample_size, input_params.sample_name, input_params.output_folder, input_params.threads);

// Print the TIN stats
std::cout << "Number of transcripts: " << tin_stats.num_transcripts << std::endl;
Expand Down Expand Up @@ -154,8 +154,6 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_

while (reader.hasNextRecord()){
// Read the next batch of records
// std::cout << "Generating " << thread_count << " thread(s)..." <<
// std::endl;
printMessage("Generating " + std::to_string(thread_count) + " thread(s)...");
std::vector<std::thread> thread_vector;
for (int thread_index=0; thread_index<thread_count; thread_index++){
Expand All @@ -165,7 +163,6 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
}

// Join the threads in thread_vector
// std::cout<<"Joining threads..."<<std::endl;
int thread_index = 0;
for (auto& t : thread_vector){
if (t.joinable()){
Expand Down Expand Up @@ -222,11 +219,14 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
std::cout << "Saving summary statistics to file..." << std::endl;

// If in RRMS mode, append RRMS accepted/rejected to the output prefix
std::string output_prefix = "bam";
// std::string output_prefix = "bam";
std::string output_prefix = input_params.sample_name;
if (input_params.rrms_csv != ""){
output_prefix += input_params.rrms_filter ? "_rrms_accepted" : "_rrms_rejected";
output_prefix += input_params.rrms_filter ? "_accepted" : "_rejected";
}
std::string summary_filepath = input_params.output_folder + "/" + output_prefix + "_summary.txt";

std::string summary_filepath = input_params.output_folder + "/" + output_prefix + "_summary.bam.json";

final_output.save_summary(summary_filepath, input_params, final_output);
std::cout << "Saved file: " << summary_filepath << std::endl;

Expand Down
61 changes: 22 additions & 39 deletions src/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,6 @@ def get_common_param(margs):
pat_split = margs.pattern.split("*")
param_dict["input_files"].extend(
glob(os.path.join("*".join(pat_split[:-1]), "*" + pat_split[-1])))

# read_count = int(margs.read_count)
# param_dict["read_count"] = read_count

if len(param_dict["input_files"]) == 0:
parsing_error_msg += "No input file(s) were provided.\n"
Expand All @@ -88,8 +85,6 @@ def get_common_param(margs):
except OSError:
parsing_error_msg += "Cannot create folder for " + param_dict["output_folder"] + " \n"

param_dict["out_prefix"] = margs.outprefix

# Set up logging to file and stdout
if margs.log is None or margs.log == "":
parsing_error_msg += "No log file was provided.\n"
Expand All @@ -113,6 +108,7 @@ def get_common_param(margs):
param_dict["log_level"] = margs.log_level

param_dict["threads"] = margs.threads
param_dict["sample_name"] = margs.sample

# Reset the param_dict if there are parsing errors
if parsing_error_msg != "":
Expand All @@ -133,17 +129,15 @@ def fq_module(margs):

else:
logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
param_dict["out_prefix"] += "fastq"

# Import the SWIG Python wrapper for our C++ module
input_para = lrst.Input_Para()
input_para.threads = param_dict["threads"]

input_para.other_flags = 0
input_para.user_defined_fastq_base_qual_offset = margs.udqual

input_para.output_folder = str(param_dict["output_folder"])
input_para.out_prefix = str(param_dict["out_prefix"])
input_para.sample_name = param_dict["sample_name"]

for _ipf in param_dict["input_files"]:
input_para.add_input_file(str(_ipf))
Expand All @@ -157,7 +151,7 @@ def fq_module(margs):
fq_html_gen = generate_html.ST_HTML_Generator(
[["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality",
"read_avg_base_quality"], "FASTQ QC", param_dict], plot_filepaths, static=False)
fq_html_gen.generate_html()
fq_html_gen.generate_html(input_para.sample_name, "fastq")

logging.info("Done. Output files are in %s", param_dict["output_folder"])
else:
Expand All @@ -176,12 +170,11 @@ def fa_module(margs):
else:
# If there are no parse errors, run the filetype-specific module
logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
param_dict["out_prefix"] += "fasta"
input_para = lrst.Input_Para()
input_para.threads = param_dict["threads"]
input_para.other_flags = 0
input_para.output_folder = str(param_dict["output_folder"])
input_para.out_prefix = str(param_dict["out_prefix"])
input_para.sample_name = param_dict["sample_name"]

for _ipf in param_dict["input_files"]:
input_para.add_input_file(str(_ipf))
Expand All @@ -195,7 +188,7 @@ def fa_module(margs):
fa_html_gen = generate_html.ST_HTML_Generator(
[["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts"], "FASTA QC",
param_dict], plot_filepaths, static=True)
fa_html_gen.generate_html()
fa_html_gen.generate_html(input_para.sample_name, "fasta")
logging.info("Done. Output files are in %s", param_dict["output_folder"])

else:
Expand All @@ -211,11 +204,10 @@ def bam_module(margs):

else:
logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
param_dict["out_prefix"] += "bam";
input_para = lrst.Input_Para()
input_para.threads = param_dict["threads"]
input_para.output_folder = str(param_dict["output_folder"])
input_para.out_prefix = str(param_dict["out_prefix"])
input_para.sample_name = param_dict["sample_name"]

# Set the reference genome file and base modification threshold
ref_genome = margs.ref if margs.ref != "" or margs.ref is not None else ""
Expand Down Expand Up @@ -269,7 +261,7 @@ def bam_module(margs):
# Generate the HTML report
bam_html_gen = generate_html.ST_HTML_Generator(
[qc_info_list, "BAM QC", param_dict], plot_filepaths, static=False)
bam_html_gen.generate_html()
bam_html_gen.generate_html(input_para.sample_name, "bam")
logging.info("Done. Output files are in %s", param_dict["output_folder"])

else:
Expand All @@ -288,26 +280,19 @@ def rrms_module(margs):
input_para = lrst.Input_Para()
input_para.threads = param_dict["threads"]
input_para.output_folder = str(param_dict["output_folder"])
input_para.out_prefix = str(param_dict["out_prefix"])
input_para.sample_name = param_dict["sample_name"]

for _ipf in param_dict["input_files"]:
input_para.add_input_file(str(_ipf))

# Set the RRMS input CSV file
input_para.rrms_csv = margs.csv
logging.info("RRMS CSV file is " + input_para.rrms_csv)

# Get the output prefix
output_prefix = param_dict["out_prefix"]

# Run QC for both accepted and rejected reads
rrms_filter = [True, False]
for filter_type in rrms_filter:

# Set the RRMS filter type
input_para.rrms_filter = filter_type

# Set the output prefix
param_dict["out_prefix"] = output_prefix + "rrms_" + ("accepted" if filter_type else "rejected")
input_para.rrms_filter = filter_type # True for accepted reads, False for rejected reads
param_dict["mod"] = input_para.mod_analysis = False # Disable base modification analysis for RRMS (use BAM module for this)

# Run the QC module
Expand All @@ -332,7 +317,7 @@ def rrms_module(margs):
# Generate the HTML report
bam_html_gen = generate_html.ST_HTML_Generator(
[qc_info_list, "BAM QC", param_dict], plot_filepaths, static=False)
bam_html_gen.generate_html()
bam_html_gen.generate_html(input_para.sample_name, "rrms_bam_" + ("accepted" if filter_type else "rejected"))
logging.info("Done. Output files are in %s", param_dict["output_folder"])

else:
Expand All @@ -349,11 +334,10 @@ def seqtxt_module(margs):

else:
logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
param_dict["out_prefix"] += "seqtxt"
input_para = lrst.Input_Para()
input_para.threads = param_dict["threads"]
input_para.output_folder = str(param_dict["output_folder"])
input_para.out_prefix = str(param_dict["out_prefix"])
input_para.sample_name = param_dict["sample_name"]

for _ipf in param_dict["input_files"]:
input_para.add_input_file(str(_ipf))
Expand All @@ -370,7 +354,7 @@ def seqtxt_module(margs):
[["basic_st", "read_length_bar", "read_length_hist"],
report_title, param_dict], plot_filepaths, static=False)

seqtxt_html_gen.generate_html()
seqtxt_html_gen.generate_html(input_para.sample_name, "basecall_summary")
logging.info("Done. Output files are in %s", param_dict["output_folder"])
else:
logging.error("QC did not generate.")
Expand All @@ -386,11 +370,10 @@ def fast5_module(margs):

else:
# logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
param_dict["out_prefix"] += "FAST5"
input_para = lrst.Input_Para()
input_para.threads = param_dict["threads"]
input_para.output_folder = str(param_dict["output_folder"])
input_para.out_prefix = str(param_dict["out_prefix"])
input_para.sample_name = param_dict["sample_name"]
input_para.other_flags = 0 # 0 for normal QC, 1 for signal statistics output

for _ipf in param_dict["input_files"]:
Expand All @@ -405,7 +388,7 @@ def fast5_module(margs):
fast5_html_obj = generate_html.ST_HTML_Generator(
[["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality"],
"FAST5 QC", param_dict], plot_filepaths, static=False)
fast5_html_obj.generate_html()
fast5_html_obj.generate_html(input_para.sample_name, "fast5")
logging.info("Done. Output files are in %s", param_dict["output_folder"])

else:
Expand All @@ -421,11 +404,11 @@ def fast5_signal_module(margs):

else:
# logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
param_dict["out_prefix"] += "fast5_signal"
input_para = lrst.Input_Para()
input_para.threads = param_dict["threads"]
input_para.output_folder = str(param_dict["output_folder"])
input_para.out_prefix = str(param_dict["out_prefix"])
input_para.sample_name = param_dict["sample_name"]

input_para.other_flags = 1 # 0 for normal QC, 1 for signal statistics output

# Get the read count if specified
Expand All @@ -450,7 +433,7 @@ def fast5_signal_module(margs):
plot_filepaths = plot(fast5_output, param_dict, 'FAST5s')
fast5_html_obj = generate_html.ST_HTML_Generator(
[["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False)
fast5_html_obj.generate_html(signal_plots=True)
fast5_html_obj.generate_html(input_para.sample_name, "fast5_signal", signal_plots=True)
logging.info("Done. Output files are in %s", param_dict["output_folder"])

else:
Expand All @@ -467,11 +450,10 @@ def pod5_module(margs):

else:
# logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
param_dict["out_prefix"] += "POD5"
input_para = {}
input_para['threads'] = param_dict["threads"]
input_para['output_folder'] = str(param_dict["output_folder"])
input_para['out_prefix'] = str(param_dict["out_prefix"])
input_para['sample_name'] = param_dict["sample_name"]
input_para['other_flags'] = 0 # 0 for normal QC, 1 for signal statistics output
input_para['input_files'] = []
for input_file in param_dict["input_files"]:
Expand All @@ -496,7 +478,6 @@ def pod5_module(margs):
basecalls_input = lrst.Input_Para()
basecalls_input.threads = param_dict["threads"]
basecalls_input.output_folder = str(param_dict["output_folder"])
basecalls_input.out_prefix = str(param_dict["out_prefix"])
basecalls_input.add_input_file(basecalls)
bam_output = lrst.Output_BAM()
exit_code = lrst.callBAMModule(basecalls_input, bam_output)
Expand All @@ -518,7 +499,7 @@ def pod5_module(margs):
webpage_title = "POD5 QC"
fast5_html_obj = generate_html.ST_HTML_Generator(
[["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False)
fast5_html_obj.generate_html(signal_plots=True)
fast5_html_obj.generate_html(input_para.sample_name, "pod5_signal", signal_plots=True)
logging.info("Done. Output files are in %s", param_dict["output_folder"])

else:
Expand All @@ -543,6 +524,8 @@ def set_file_parser_defaults(file_parser):
help="The number of threads used. Default: 1.")
file_parser.add_argument("-Q", "--outprefix", type=str, default="QC_",
help="The prefix for output filenames. Default: `QC_`.")
file_parser.add_argument("-s", "--sample", type=str, default="Sample",
help="Sample name. Default: `Sample`")


# Set up the argument parser
Expand Down
Loading
Loading