From ebb5cdf827858ec11652e47ff324b6defdd3867d Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Fri, 28 Nov 2025 15:26:04 -0500 Subject: [PATCH 1/8] feature: add "+" and slicing for AASequence --- openms_python/py_aasequence.py | 103 ++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 9 deletions(-) diff --git a/openms_python/py_aasequence.py b/openms_python/py_aasequence.py index 34fcf9d..95bedc2 100644 --- a/openms_python/py_aasequence.py +++ b/openms_python/py_aasequence.py @@ -8,7 +8,7 @@ class Py_AASequence: """ - A Pythonic wrapper around pyOpenMS AASequence. + A Pythonic, immutable wrapper around pyOpenMS AASequence. This class provides intuitive properties and methods for working with amino acid sequences, including common operations like reversing and @@ -204,25 +204,110 @@ def __eq__(self, other: object) -> bool: return False return self.sequence == other.sequence - def __getitem__(self, index: int) -> str: + def __getitem__(self, index): """ - Get residue at position. + Get residue(s) at position(s). + + Supports both single indexing and slicing, returning Py_AASequence objects. Args: - index: Position in the sequence (0-based). + index: Integer for single residue, or slice object for subsequence. Returns: - str: Single letter amino acid code. + Py_AASequence: Wrapped residue or subsequence. + + Example: + >>> seq = Py_AASequence.from_string("PEPTIDE") + >>> seq[1] # Returns Py_AASequence("E") + >>> seq[1:4] # Returns Py_AASequence("EPT") + >>> seq[-1] # Returns Py_AASequence("E") """ - if index < 0 or index >= len(self): - raise IndexError(f"Index {index} out of range for sequence of length {len(self)}") - residue = self._sequence.getResidue(index) - return residue.getOneLetterCode() + if isinstance(index, slice): + start, stop, step = index.indices(len(self)) + if step != 1: + raise ValueError("Step slicing is not supported for amino acid sequences") + subsequence = self.sequence[start:stop] + return Py_AASequence.from_string(subsequence) + else: + # Handle negative indices + if index < 0: + index = len(self) + index + if index < 0 or index >= len(self): + raise IndexError(f"Index {index} out of range for sequence of length {len(self)}") + residue = self._sequence.getResidue(index) + residue_char = residue.getOneLetterCode() + return Py_AASequence.from_string(residue_char) def __iter__(self): """Iterate over residues.""" for i in range(len(self)): yield self[i] + def __add__(self, other: Py_AASequence | str) -> Py_AASequence: + """ + Concatenate sequences. + + Args: + other: Py_AASequence or string to append. + + Returns: + Py_AASequence: New concatenated sequence. + + Example: + >>> seq1 = Py_AASequence.from_string("PEP") + >>> seq2 = Py_AASequence.from_string("TIDE") + >>> combined = seq1 + seq2 + >>> print(combined.sequence) + PEPTIDE + >>> combined2 = seq1 + "TIDE" + >>> print(combined2.sequence) + PEPTIDE + """ + if isinstance(other, Py_AASequence): + combined_str = self.sequence + other.sequence + elif isinstance(other, str): + combined_str = self.sequence + other + else: + return NotImplemented + return Py_AASequence.from_string(combined_str) + + def __radd__(self, other: str) -> Py_AASequence: + """ + Support string + Py_AASequence. + + Example: + >>> seq = Py_AASequence.from_string("TIDE") + >>> combined = "PEP" + seq + >>> print(combined.sequence) + PEPTIDE + """ + if isinstance(other, str): + combined_str = other + self.sequence + return Py_AASequence.from_string(combined_str) + return NotImplemented + + def __mul__(self, times: int) -> Py_AASequence: + """ + Repeat sequence. + + Args: + times: Number of times to repeat (must be >= 0). + + Returns: + Py_AASequence: New repeated sequence. + + Example: + >>> seq = Py_AASequence.from_string("PEP") + >>> repeated = seq * 3 + >>> print(repeated.sequence) + PEPPEPPEP + """ + if not isinstance(times, int) or times < 0: + return NotImplemented + return Py_AASequence.from_string(self.sequence * times) + + def __rmul__(self, times: int) -> Py_AASequence: + """Support int * Py_AASequence.""" + return self.__mul__(times) # ==================== Additional Utilities ==================== From af516c233dc6328807c894cc47a709b9fbebf344 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Fri, 28 Nov 2025 15:33:17 -0500 Subject: [PATCH 2/8] feature: add more pythonic methods --- openms_python/py_aasequence.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/openms_python/py_aasequence.py b/openms_python/py_aasequence.py index 95bedc2..ad0a3b9 100644 --- a/openms_python/py_aasequence.py +++ b/openms_python/py_aasequence.py @@ -308,7 +308,23 @@ def __mul__(self, times: int) -> Py_AASequence: def __rmul__(self, times: int) -> Py_AASequence: """Support int * Py_AASequence.""" return self.__mul__(times) - + def __contains__(self, substring: str) -> bool: + """Check if substring is in sequence.""" + return self.has_substring(substring) + + def __hash__(self) -> int: + """Make sequences hashable for use in sets/dicts.""" + return hash(self.sequence) + + def __lt__(self, other: Py_AASequence) -> bool: + """Lexicographic comparison by sequence.""" + if not isinstance(other, Py_AASequence): + return NotImplemented + return self.sequence < other.sequence + def count(self, residue: str) -> int: + """Count occurrences of a residue, to be consistent with str.count().""" + return self._sequence.count(residue) + # ==================== Additional Utilities ==================== def get_mz(self, charge: int) -> float: From 4167d767ab0647feb83f3d715a0ada7d300eda10 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Mon, 1 Dec 2025 15:32:12 -0500 Subject: [PATCH 3/8] fix: slicing for modified peptide sequences --- openms_python/py_aasequence.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/openms_python/py_aasequence.py b/openms_python/py_aasequence.py index ad0a3b9..07becf2 100644 --- a/openms_python/py_aasequence.py +++ b/openms_python/py_aasequence.py @@ -40,7 +40,7 @@ def __init__(self, native_sequence: Optional[oms.AASequence] = None): @classmethod def from_string(cls, sequence_str: str) -> Py_AASequence: """ - Create AASequence from string representation. + Create Py_AASequence from string representation. Args: sequence_str: String representation of the amino acid sequence. @@ -57,6 +57,20 @@ def from_string(cls, sequence_str: str) -> Py_AASequence: # ==================== Pythonic Properties ==================== + @classmethod + def from_native(cls, native_sequence: oms.AASequence) -> Py_AASequence: + """ + Creates Py_AASequence from native pyOpenMS AASequence. + + Args: + native_sequence (oms.AASequence): + + Returns: + Py_AASequence: New wrapped opject + + """ + return cls(native_sequence) + @property def native(self) -> oms.AASequence: """Return the underlying pyOpenMS AASequence.""" @@ -226,8 +240,7 @@ def __getitem__(self, index): start, stop, step = index.indices(len(self)) if step != 1: raise ValueError("Step slicing is not supported for amino acid sequences") - subsequence = self.sequence[start:stop] - return Py_AASequence.from_string(subsequence) + return Py_AASequence.from_native(self._sequence.getSubsequence(start, stop - start)) else: # Handle negative indices if index < 0: From 1d0d031668588b3f72e5d8d3c142e5f51209454f Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Mon, 1 Dec 2025 15:54:25 -0500 Subject: [PATCH 4/8] feature: add string export method --- openms_python/py_aasequence.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/openms_python/py_aasequence.py b/openms_python/py_aasequence.py index 07becf2..8d2293d 100644 --- a/openms_python/py_aasequence.py +++ b/openms_python/py_aasequence.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Optional +from typing import Optional, Literal import pyopenms as oms @@ -391,4 +391,27 @@ def has_suffix(self, suffix: str) -> bool: bool: True if sequence ends with suffix. """ return self._sequence.hasSuffix(oms.AASequence.fromString(suffix)) + + + # ===================== Exporting ======================= + def to_string(self, modified=True, mod_format: Optional[Literal['unimod', 'bracket']] = 'unimod') -> str: + """ + Get string representation of the sequence. + + Returns: + str: Amino acid sequence as string. + Example: + >>> seq = Py_AASequence.from_string("PEPTIDE") + >>> seq_str = seq.to_string() + """ + if not modified: + return self.unmodified_sequence + + else: + if mod_format == 'unimod': + return self._sequence.toUniModString() + elif mod_format == 'bracket': + return self._sequence.toBracketString() + else: + raise ValueError(f"Unsupported mod_format: {mod_format}, supported are 'unimod' and 'bracket'") From 70eb83db5821b4bc8c78d99a2f3dbdd787591253 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Mon, 1 Dec 2025 16:20:40 -0500 Subject: [PATCH 5/8] test: fix tests, allow for negative indices in slicing --- openms_python/py_aasequence.py | 16 +++++++++--- tests/test_py_aasequence.py | 45 ++++++++++++++++++++++++++++------ 2 files changed, 51 insertions(+), 10 deletions(-) diff --git a/openms_python/py_aasequence.py b/openms_python/py_aasequence.py index 8d2293d..7f1365d 100644 --- a/openms_python/py_aasequence.py +++ b/openms_python/py_aasequence.py @@ -245,7 +245,7 @@ def __getitem__(self, index): # Handle negative indices if index < 0: index = len(self) + index - if index < 0 or index >= len(self): + if index >= len(self): raise IndexError(f"Index {index} out of range for sequence of length {len(self)}") residue = self._sequence.getResidue(index) residue_char = residue.getOneLetterCode() @@ -394,10 +394,18 @@ def has_suffix(self, suffix: str) -> bool: # ===================== Exporting ======================= - def to_string(self, modified=True, mod_format: Optional[Literal['unimod', 'bracket']] = 'unimod') -> str: + def to_string(self, modified=True, mod_format: Optional[Literal['default', 'unimod', 'bracket']] = 'default') -> str: """ Get string representation of the sequence. + Args: + modified (bool): Whether to include modifications in the string. + mod_format (Optional[Literal['default', 'unimod', 'bracket']]): Format for modifications. + 'default' for OpenMS format, + 'unimod' for UniMod format, + 'bracket' for bracket notation. + Default is 'unimod'. + Returns: str: Amino acid sequence as string. @@ -409,7 +417,9 @@ def to_string(self, modified=True, mod_format: Optional[Literal['unimod', 'brack return self.unmodified_sequence else: - if mod_format == 'unimod': + if mod_format == 'default': + return self._sequence.toString() + elif mod_format == 'unimod': return self._sequence.toUniModString() elif mod_format == 'bracket': return self._sequence.toBracketString() diff --git a/tests/test_py_aasequence.py b/tests/test_py_aasequence.py index 4f56f01..056851d 100644 --- a/tests/test_py_aasequence.py +++ b/tests/test_py_aasequence.py @@ -118,7 +118,7 @@ def test_py_aasequence_iteration(): seq = Py_AASequence.from_string("PEPTIDE") residues = list(seq) - assert residues == ["P", "E", "P", "T", "I", "D", "E"] + assert [res.sequence for res in residues] == ["P", "E", "P", "T", "I", "D", "E"] assert len(residues) == 7 @@ -126,17 +126,14 @@ def test_py_aasequence_indexing(): """Test indexing into sequence.""" seq = Py_AASequence.from_string("PEPTIDE") - assert seq[0] == "P" - assert seq[1] == "E" - assert seq[6] == "E" + assert seq[0].sequence == "P" + assert seq[1].sequence == "E" + assert seq[6].sequence == "E" # Test out of bounds with pytest.raises(IndexError): _ = seq[7] - with pytest.raises(IndexError): - _ = seq[-1] - def test_py_aasequence_string_representation(): """Test string representations.""" @@ -258,3 +255,37 @@ def test_py_aasequence_with_native_aasequence(): assert seq.sequence == "PEPTIDE" assert seq.native is native + + +def test_py_aasequence_to_string(): + """Test to_string method with different options.""" + seq = Py_AASequence.from_string("PEPTIDEM(Oxidation)") + + # Default should return modified string in default format + mod_str = seq.to_string() + assert mod_str == "PEPTIDEM(Oxidation)" + + # Unmodified should return unmodified sequence + unmod_str = seq.to_string(modified=False) + assert unmod_str == "PEPTIDEM" + + # Bracket format + bracket_str = seq.to_string(modified=True, mod_format='bracket') + assert bracket_str == "PEPTIDEM[147]" + + # unimod format + unimod_str = seq.to_string(modified=True, mod_format='unimod') + assert unimod_str == "PEPTIDEM(UniMod:35)" + + # Invalid format should raise error + with pytest.raises(ValueError): + _ = seq.to_string(modified=True, mod_format='invalid_format') + + +def test_slicing(): + aa_seq = Py_AASequence.from_string('PEPTIDEM(Oxidation)R') + assert aa_seq[0].sequence == 'P' + assert aa_seq[-1].sequence == 'R' + assert aa_seq[1:4].sequence == 'EPT' + assert aa_seq[-2:].sequence == 'M(Oxidation)R' + From d9d6c47d732981b28e458c3cc1d63d51e557f514 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Mon, 1 Dec 2025 17:04:09 -0500 Subject: [PATCH 6/8] fix: address ai review comments --- openms_python/py_aasequence.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openms_python/py_aasequence.py b/openms_python/py_aasequence.py index 7f1365d..ede05ab 100644 --- a/openms_python/py_aasequence.py +++ b/openms_python/py_aasequence.py @@ -394,7 +394,7 @@ def has_suffix(self, suffix: str) -> bool: # ===================== Exporting ======================= - def to_string(self, modified=True, mod_format: Optional[Literal['default', 'unimod', 'bracket']] = 'default') -> str: + def to_string(self, modified=True, mod_format: Literal['default', 'unimod', 'bracket'] = 'default') -> str: """ Get string representation of the sequence. @@ -404,7 +404,7 @@ def to_string(self, modified=True, mod_format: Optional[Literal['default', 'unim 'default' for OpenMS format, 'unimod' for UniMod format, 'bracket' for bracket notation. - Default is 'unimod'. + Default is 'default'. Returns: str: Amino acid sequence as string. @@ -424,4 +424,4 @@ def to_string(self, modified=True, mod_format: Optional[Literal['default', 'unim elif mod_format == 'bracket': return self._sequence.toBracketString() else: - raise ValueError(f"Unsupported mod_format: {mod_format}, supported are 'unimod' and 'bracket'") + raise ValueError(f"Unsupported mod_format: {mod_format}, supported are 'default', 'unimod' and 'bracket'") From 266a926a1a88d7991c689c8d7c54b2716cef0e5f Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Mon, 1 Dec 2025 17:30:24 -0500 Subject: [PATCH 7/8] fix: critical slicing bug --- openms_python/py_aasequence.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/openms_python/py_aasequence.py b/openms_python/py_aasequence.py index ede05ab..27ebe9d 100644 --- a/openms_python/py_aasequence.py +++ b/openms_python/py_aasequence.py @@ -247,9 +247,8 @@ def __getitem__(self, index): index = len(self) + index if index >= len(self): raise IndexError(f"Index {index} out of range for sequence of length {len(self)}") - residue = self._sequence.getResidue(index) - residue_char = residue.getOneLetterCode() - return Py_AASequence.from_string(residue_char) + residue = self._sequence.getSubsequence(index, 1) + return Py_AASequence.from_native(residue) def __iter__(self): """Iterate over residues.""" From f2fa5e84040a207c684cba936c37dd872be6c37f Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Mon, 1 Dec 2025 17:42:26 -0500 Subject: [PATCH 8/8] address ai comments --- openms_python/py_aasequence.py | 6 ++++-- tests/test_py_aasequence.py | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/openms_python/py_aasequence.py b/openms_python/py_aasequence.py index 27ebe9d..a6cc4b2 100644 --- a/openms_python/py_aasequence.py +++ b/openms_python/py_aasequence.py @@ -4,6 +4,7 @@ from typing import Optional, Literal import pyopenms as oms +import warnings class Py_AASequence: @@ -334,8 +335,9 @@ def __lt__(self, other: Py_AASequence) -> bool: return NotImplemented return self.sequence < other.sequence def count(self, residue: str) -> int: - """Count occurrences of a residue, to be consistent with str.count().""" - return self._sequence.count(residue) + """Count occurrences of a residue, to be consistent with str.count(), note currently does not account for modifications""" + warnings.warn("count method does not account for modifications") + return self.unmodified_sequence.count(residue) # ==================== Additional Utilities ==================== diff --git a/tests/test_py_aasequence.py b/tests/test_py_aasequence.py index 056851d..93ee370 100644 --- a/tests/test_py_aasequence.py +++ b/tests/test_py_aasequence.py @@ -289,3 +289,9 @@ def test_slicing(): assert aa_seq[1:4].sequence == 'EPT' assert aa_seq[-2:].sequence == 'M(Oxidation)R' +def test_count(): + aa_seq = Py_AASequence.from_string('PEPTIDEM(Oxidation)R') + assert aa_seq.count('E') == 2 + assert aa_seq.count('P') == 2 + assert aa_seq.count('K') == 0 +