From 2f9532010d9e2a02db84d88794e47d91b69d961c Mon Sep 17 00:00:00 2001 From: nbollis Date: Tue, 16 Sep 2025 10:16:40 -0500 Subject: [PATCH 1/2] retain accessions --- README.md | 1 + src/Peptides.cpp | 28 +++++++++++++++++++++++++++- src/Peptides.h | 2 ++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b386365..007093e 100644 --- a/README.md +++ b/README.md @@ -45,5 +45,6 @@ $ ./mimic |replaceI | I | Count I as L for homolgy check | |empiric | e | Infer amino acid frequency from fasta for mutations | |no-digest | N | Do not digest the sequence before scrambling | +|retain-accession | A | Retains original accession (after prefix) in fasta header lines if set | diff --git a/src/Peptides.cpp b/src/Peptides.cpp index 1b76a0b..53bbb82 100644 --- a/src/Peptides.cpp +++ b/src/Peptides.cpp @@ -151,6 +151,19 @@ void Peptides::readFasta(string& path, bool write, std::ostream& os) { if (write) os << line << std::endl; if (line[0] == '>') { // id line + string accession; + if (retainAccession_) { + // Extract accession from header, e.g. ">sp|P84243|H33_HUMAN ..." + size_t pipe1 = line.find('|'); + if (pipe1 != string::npos) { + size_t pipe2 = line.find('|', pipe1 + 1); + if (pipe2 != string::npos) { + accession = line.substr(pipe1 + 1, pipe2 - pipe1 - 1); + } + } + accessions_.push_back(accession); + } + if (spillOver) { assert(!seq.empty()); // add peptides and KR-stretches to connectorStrings_ @@ -158,7 +171,12 @@ void Peptides::readFasta(string& path, bool write, std::ostream& os) { seq = ""; } ostringstream newProteinName(""); - newProteinName << ">" << proteinNamePrefix_ << ++proteinNo; + if (retainAccession_ && !accession.empty()) { + newProteinName << ">" << proteinNamePrefix_ << accession << "_" << (++proteinNo); + } + else { + newProteinName << ">" << proteinNamePrefix_ << (++proteinNo); + } connectorStrings_.push_back(newProteinName.str()); assert(connectorStrings_.size() == pepNo+1); } else { // amino acid sequence @@ -333,6 +351,11 @@ bool Peptides::parseOptions(int argc, char **argv){ "Do not digest the sequence before scrambling", "", TRUE_IF_SET); + cmd.defineOption("A", + "retain-accession", + "Retain the original accession in the mimic output. \n Not set >mimic|Random_1|shuffle_1 \n If set >mimic|Random_P84243_1|shuffle_1", + "", + TRUE_IF_SET); cmd.parseArgs(argc, argv); @@ -363,6 +386,9 @@ bool Peptides::parseOptions(int argc, char **argv){ } if (cmd.optionSet("N")) { noDigest_ = true; + } + if (cmd.optionSet("A")) { + retainAccession_ = true; } if (!cmd.arguments.empty()) { diff --git a/src/Peptides.h b/src/Peptides.h index 104bb1e..b63c106 100644 --- a/src/Peptides.h +++ b/src/Peptides.h @@ -53,6 +53,7 @@ class Peptides map > pep2ixs_; vector connectorStrings_; + vector accessions_; set usedPeptides_; // input arguments @@ -73,6 +74,7 @@ class Peptides AbsAminoAcidDist absBackground_; bool isVerbose_ = false; bool noDigest_ = false; + bool retainAccession_ = false; }; #endif /*PEPTIDES_H_*/ From bd14d36dde232f80248352f8fae1f65eabe9b3a3 Mon Sep 17 00:00:00 2001 From: nbollis Date: Tue, 30 Sep 2025 17:36:41 -0500 Subject: [PATCH 2/2] and termini number --- src/Peptides.cpp | 30 +++++++++++++++++++++++------- src/Peptides.h | 8 +++++--- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/Peptides.cpp b/src/Peptides.cpp index 53bbb82..575560d 100644 --- a/src/Peptides.cpp +++ b/src/Peptides.cpp @@ -52,11 +52,11 @@ std::vector getIdxs(const string& sequence){ return idxs; } -Peptides::Peptides(unsigned int minLength, set usedPeptides, AminoAcidDist background, std::mt19937 rGen, unsigned int maxTries, bool replaceI) : +Peptides::Peptides(unsigned int minLength, set usedPeptides, AminoAcidDist background, std::mt19937 rGen, unsigned int maxTries, bool replaceI, unsigned int retainTermini) : maxTries_{maxTries}, usedPeptides_{std::move(usedPeptides)}, minLen_(minLength), replaceI_(replaceI), seed_(1u), multFactor_(1u), sharedPeptideRatio_(0.0), proteinNamePrefix_("mimic|Random_"), prependOriginal_(false), - background_{std::move(background)}, inferAAFrequency_{false}, rGen_{rGen} {} + background_{std::move(background)}, inferAAFrequency_{false}, rGen_{rGen}, retainTermini_{ retainTermini } {} void Peptides::printAll(const vector& connectorStrings, const std::string& suffix, std::ostream& os) { @@ -193,9 +193,17 @@ void Peptides::readFasta(string& path, bool write, std::ostream& os) { } void Peptides::shuffle(const string& in,string& out) { - out=in; - std::shuffle(out.begin(), out.end(), rGen_); + if (retainTermini_ == 0 || in.length() <= 2 * retainTermini_) { + out = in; + std::shuffle(out.begin(), out.end(), rGen_); + } else { + out = in; + auto start = out.begin() + retainTermini_; + auto end = out.end() - retainTermini_; + std::shuffle(start, end, rGen_); + } } + void Peptides::mutate(const string& in, string& out) { out=in; std::vector okIds; @@ -288,7 +296,8 @@ int Peptides::run() { readFasta(inFile_, prependOriginal_, outStream); background_.print(cerr); for (unsigned int m = 0; m < multFactor_; ++m) { - Peptides entrapmentDB(minLen_, usedPeptides_, background_, rGen_, maxTries_, replaceI_); + Peptides entrapmentDB(minLen_, usedPeptides_, background_, rGen_, maxTries_, replaceI_, retainTermini_); + cerr << "Shuffling round: " << (m+1) << endl; entrapmentDB.shuffle(pep2ixs_, logger); @@ -356,6 +365,10 @@ bool Peptides::parseOptions(int argc, char **argv){ "Retain the original accession in the mimic output. \n Not set >mimic|Random_1|shuffle_1 \n If set >mimic|Random_P84243_1|shuffle_1", "", TRUE_IF_SET); + cmd.defineOption("T", + "retain-termini", + "Number of N- and C-terminal residues to retain during scrambling (NoDigest only)", + "int"); cmd.parseArgs(argc, argv); @@ -390,6 +403,9 @@ bool Peptides::parseOptions(int argc, char **argv){ if (cmd.optionSet("A")) { retainAccession_ = true; } + if (cmd.optionSet("T")) { + retainTermini_ = cmd.getInt("T", 0, 100); + } if (!cmd.arguments.empty()) { inFile_ = cmd.arguments[0]; @@ -400,8 +416,8 @@ bool Peptides::parseOptions(int argc, char **argv){ return true; } -Peptides::Peptides(unsigned int minLen, set usedPeptides, std::mt19937 rGen, unsigned int maxTries, bool replaceI): -Peptides(minLen, std::move(usedPeptides), AminoAcidDist{replaceI}, rGen, maxTries, replaceI) { +Peptides::Peptides(unsigned int minLen, set usedPeptides, std::mt19937 rGen, unsigned int maxTries, bool replaceI, unsigned int retainTermini): +Peptides(minLen, std::move(usedPeptides), AminoAcidDist{replaceI}, rGen, maxTries, replaceI, retainTermini) { } diff --git a/src/Peptides.h b/src/Peptides.h index b63c106..6b7c86e 100644 --- a/src/Peptides.h +++ b/src/Peptides.h @@ -27,9 +27,9 @@ using namespace std; class Peptides { public: - Peptides(unsigned int minLen, set usedPeptides, AminoAcidDist background, std::mt19937 rGen, unsigned int maxTries = 1000, bool replaceI=false); - Peptides(unsigned int minLen, set usedPeptides, std::mt19937 rGen, unsigned int maxTries = 1000, bool replaceI=false); - Peptides(); + Peptides(unsigned int minLen, set usedPeptides, AminoAcidDist background, std::mt19937 rGen, unsigned int maxTries = 1000, bool replaceI = false, unsigned int retainTermini = 0); + Peptides(unsigned int minLen, set usedPeptides, std::mt19937 rGen, unsigned int maxTries = 1000, bool replaceI = false, unsigned int retainTermini = 0); + Peptides(); bool parseOptions(int argc, char **argv); int run(); @@ -75,6 +75,8 @@ class Peptides bool isVerbose_ = false; bool noDigest_ = false; bool retainAccession_ = false; + unsigned int retainTermini_ = 0; + }; #endif /*PEPTIDES_H_*/