From 9e0673f9010111f17f97dad16e3a55af97f2b1f1 Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Mon, 9 Dec 2019 13:20:07 -0800 Subject: [PATCH 01/16] Start of biopy --- stdlib/biopy/SeqUtils/CheckSum.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 stdlib/biopy/SeqUtils/CheckSum.py diff --git a/stdlib/biopy/SeqUtils/CheckSum.py b/stdlib/biopy/SeqUtils/CheckSum.py new file mode 100644 index 000000000..e69de29bb From 9a76b34a408c60d80ebe4f302de1fc0a3c08ef9e Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Mon, 9 Dec 2019 13:54:49 -0800 Subject: [PATCH 02/16] Add checksum.seq --- stdlib/biopy/SeqUtils/checksum.seq | 44 +++++++++++++++++++ .../sequtils_test/checksum_test.seq | 13 ++++++ 2 files changed, 57 insertions(+) create mode 100644 stdlib/biopy/SeqUtils/checksum.seq create mode 100644 test/stdlib/biopy_test/sequtils_test/checksum_test.seq diff --git a/stdlib/biopy/SeqUtils/checksum.seq b/stdlib/biopy/SeqUtils/checksum.seq new file mode 100644 index 000000000..1c2f357dc --- /dev/null +++ b/stdlib/biopy/SeqUtils/checksum.seq @@ -0,0 +1,44 @@ +# def crc32(seq): +# """ +# crc32(seq) + +# Return the crc32 checksum for a sequence (string or Seq object). +# """ +# return _crc32(_as_bytes(str(seq))) + +def _init_table_h(): + _table_h = list[int]() + for i in range(256): + part_l = i + part_h = 0 + for j in range(8): + rflag = part_l & 1 + part_l >>= 1 + if part_h & 1: + part_l |= 1 << 31 + part_h >>= 1 + if rflag: + part_h ^= 0xD8000000 + _table_h.append(part_h) + return _table_h + +#Initialisation +_table_h = _init_table_h() + +def crc64(s: str): + """ + crc64(s) + + Return the crc64 checksum for a sequence (string or Seq object). + """ + crcl = 0 + crch = 0 + for c in s: + shr = (crch & 0xFF) << 24 + temp1h = crch >> 8 + temp1l = (crcl >> 8) | shr + idx = (crcl ^ ord(c)) & 0xFF + crch = temp1h ^ _table_h[idx] + crcl = temp1l + + return "CRC-%08X%08X" % (crch, crcl) \ No newline at end of file diff --git a/test/stdlib/biopy_test/sequtils_test/checksum_test.seq b/test/stdlib/biopy_test/sequtils_test/checksum_test.seq new file mode 100644 index 000000000..7f75dfe1a --- /dev/null +++ b/test/stdlib/biopy_test/sequtils_test/checksum_test.seq @@ -0,0 +1,13 @@ +import checksum + +str_light_chain_one = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQ" + "HPGKAPKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGL" + "QAEDEADYYCSSYAGSSTLVFGGGTKLTVL") + +str_light_chain_two = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQ" + "HPGKAPKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGL" + "QAEDEADYYCCSYAGSSTWVFGGGTKLTVL") + +def test_checksum(): + num = checksum.crc64(str_light_chain_one) + print num \ No newline at end of file From 13ca06b476c8afe2e59497e35ddaeeba60fa5603 Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Tue, 10 Dec 2019 13:36:51 -0800 Subject: [PATCH 03/16] Add sequtils.checksum.crc64() --- stdlib/biopy/SeqUtils/CheckSum.py | 0 stdlib/biopy/SeqUtils/checksum.seq | 60 +++++++++++++++++-- .../sequtils_test/checksum_test.seq | 9 ++- 3 files changed, 62 insertions(+), 7 deletions(-) delete mode 100644 stdlib/biopy/SeqUtils/CheckSum.py diff --git a/stdlib/biopy/SeqUtils/CheckSum.py b/stdlib/biopy/SeqUtils/CheckSum.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/stdlib/biopy/SeqUtils/checksum.seq b/stdlib/biopy/SeqUtils/checksum.seq index 1c2f357dc..d05dc5d64 100644 --- a/stdlib/biopy/SeqUtils/checksum.seq +++ b/stdlib/biopy/SeqUtils/checksum.seq @@ -1,12 +1,15 @@ + +# TODO/ CAVEATS: +# @jordan - crc32() needs to be implemented # def crc32(seq): # """ # crc32(seq) # Return the crc32 checksum for a sequence (string or Seq object). # """ -# return _crc32(_as_bytes(str(seq))) +# return _crc32(seq) -def _init_table_h(): +def _init_table_h() -> list[int]: _table_h = list[int]() for i in range(256): part_l = i @@ -25,7 +28,7 @@ def _init_table_h(): #Initialisation _table_h = _init_table_h() -def crc64(s: str): +def crc64(s: str) -> str: """ crc64(s) @@ -41,4 +44,53 @@ def crc64(s: str): crch = temp1h ^ _table_h[idx] crcl = temp1l - return "CRC-%08X%08X" % (crch, crcl) \ No newline at end of file + # need it in 08X return f"CRC-{crch}{crcl}" + return f"CRC-{crch}{crcl}" + #"CRC-%08X%08X" % (crch, crcl) + +def gcg(s: seq) -> int: + """ + gcg(s) + + Given a nucleotide or amino-acid secuence (or any string), + returns the GCG checksum (int). Checksum used by GCG program. + seq type = str. + """ + + index = 0 + checksum = 0 + + for char in s: + index += 1 + checksum += index * ord(char.upper()) + if index == 57: + index = 0 + return checksum % 10000 + +# TODO/CAVEATS: +# @jordan - This still needs to be implemented +#hashlib and base64?? +# def seguid(s: seq) -> str: +# """ +# seguid(seq: seq) + +# Return the SEGUID (string) for a sequence (string or Seq object). + +# Given a nucleotide or amino-acid secuence (or any string), +# returns the SEGUID string (A SEquence Globally Unique IDentifier). +# seq type = str. +# """ +# import hashlib +# import base64 + +# m = hashlib.sha1() +# m.update(_as_bytes(s.upper())) +# try: +# # For Python 3+ +# tmp = base64.encodebytes(m.digest()) +# return tmp.decode().replace("\n", "").rstrip("=") +# except AttributeError: +# pass +# # For all other Pythons +# return base64.b64encode(m.digest()).rstrip("=") + diff --git a/test/stdlib/biopy_test/sequtils_test/checksum_test.seq b/test/stdlib/biopy_test/sequtils_test/checksum_test.seq index 7f75dfe1a..7f276da08 100644 --- a/test/stdlib/biopy_test/sequtils_test/checksum_test.seq +++ b/test/stdlib/biopy_test/sequtils_test/checksum_test.seq @@ -1,4 +1,4 @@ -import checksum +from biopy.sequtils.checksum import crc64 str_light_chain_one = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQ" "HPGKAPKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGL" @@ -8,6 +8,9 @@ str_light_chain_two = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQ" "HPGKAPKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGL" "QAEDEADYYCCSYAGSSTWVFGGGTKLTVL") +@test def test_checksum(): - num = checksum.crc64(str_light_chain_one) - print num \ No newline at end of file + num1 = crc64(str_light_chain_one) + num2 = crc64(str_light_chain_two) + assert num1 == num2 +test_checksum() \ No newline at end of file From 8514bf78695d130206f30867fb307781b2bb4ab4 Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Tue, 10 Dec 2019 13:58:22 -0800 Subject: [PATCH 04/16] Add sequtils.checksum.gcg() --- stdlib/biopy/SeqUtils/checksum.seq | 8 +++++--- .../stdlib/biopy_test/sequtils_test/checksum_test.seq | 11 +++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/stdlib/biopy/SeqUtils/checksum.seq b/stdlib/biopy/SeqUtils/checksum.seq index d05dc5d64..99cc60da2 100644 --- a/stdlib/biopy/SeqUtils/checksum.seq +++ b/stdlib/biopy/SeqUtils/checksum.seq @@ -33,6 +33,9 @@ def crc64(s: str) -> str: crc64(s) Return the crc64 checksum for a sequence (string or Seq object). + + TODO/CAVEATS: + @jordan - return formatting crch and crcl as hex """ crcl = 0 crch = 0 @@ -48,7 +51,7 @@ def crc64(s: str) -> str: return f"CRC-{crch}{crcl}" #"CRC-%08X%08X" % (crch, crcl) -def gcg(s: seq) -> int: +def gcg(s: str) -> int: """ gcg(s) @@ -57,8 +60,7 @@ def gcg(s: seq) -> int: seq type = str. """ - index = 0 - checksum = 0 + index = checksum = 0 for char in s: index += 1 diff --git a/test/stdlib/biopy_test/sequtils_test/checksum_test.seq b/test/stdlib/biopy_test/sequtils_test/checksum_test.seq index 7f276da08..167a72b9b 100644 --- a/test/stdlib/biopy_test/sequtils_test/checksum_test.seq +++ b/test/stdlib/biopy_test/sequtils_test/checksum_test.seq @@ -1,4 +1,4 @@ -from biopy.sequtils.checksum import crc64 +from biopy.sequtils.checksum import crc64, gcg str_light_chain_one = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQ" "HPGKAPKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGL" @@ -13,4 +13,11 @@ def test_checksum(): num1 = crc64(str_light_chain_one) num2 = crc64(str_light_chain_two) assert num1 == num2 -test_checksum() \ No newline at end of file +test_checksum() + +@test +def test_gcg(): + num1 = gcg(str_light_chain_one) + num2 = gcg(str_light_chain_two) + assert num1 != num2 +test_gcg() \ No newline at end of file From 0427156d003ed5d5b512afd5a6ba1cc90f16e745 Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Tue, 10 Dec 2019 14:03:00 -0800 Subject: [PATCH 05/16] Add sequtils.codonusageindices.seq --- stdlib/biopy/SeqUtils/codonusageindices.seq | 25 +++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 stdlib/biopy/SeqUtils/codonusageindices.seq diff --git a/stdlib/biopy/SeqUtils/codonusageindices.seq b/stdlib/biopy/SeqUtils/codonusageindices.seq new file mode 100644 index 000000000..58f7072ed --- /dev/null +++ b/stdlib/biopy/SeqUtils/codonusageindices.seq @@ -0,0 +1,25 @@ +""" +Codon adaption indxes, including Sharp and Li (1987) E. coli index. + +Currently this module only defines a single codon adaption index from +Sharp & Li, Nucleic Acids Res. 1987. +""" +# Turn black code style off +# fmt: off + +SharpEcoliIndex = { + "GCA": 0.586, "GCC": 0.122, "GCG": 0.424, "GCT": 1, "AGA": 0.004, + "AGG": 0.002, "CGA": 0.004, "CGC": 0.356, "CGG": 0.004, "CGT": 1, "AAC": 1, + "AAT": 0.051, "GAC": 1, "GAT": 0.434, "TGC": 1, "TGT": 0.5, "CAA": 0.124, + "CAG": 1, "GAA": 1, "GAG": 0.259, "GGA": 0.01, "GGC": 0.724, "GGG": 0.019, + "GGT": 1, "CAC": 1, "CAT": 0.291, "ATA": 0.003, "ATC": 1, "ATT": 0.185, + "CTA": 0.007, "CTC": 0.037, "CTG": 1, "CTT": 0.042, "TTA": 0.02, + "TTG": 0.02, "AAA": 1, "AAG": 0.253, "ATG": 1, "TTC": 1, "TTT": 0.296, + "CCA": 0.135, "CCC": 0.012, "CCG": 1, "CCT": 0.07, "AGC": 0.41, + "AGT": 0.085, "TCA": 0.077, "TCC": 0.744, "TCG": 0.017, "TCT": 1, + "ACA": 0.076, "ACC": 1, "ACG": 0.099, "ACT": 0.965, "TGG": 1, "TAC": 1, + "TAT": 0.239, "GTA": 0.495, "GTC": 0.066, "GTG": 0.221, "GTT": 1 +} + +# Turn black code style on +# fmt: on \ No newline at end of file From 67363dd29fafc098464e94984183274b145d587b Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Tue, 10 Dec 2019 14:46:56 -0800 Subject: [PATCH 06/16] Add biopy.sequtils.poparamdata.seq --- stdlib/biopy/SeqUtils/codonusageindices.seq | 22 +-- stdlib/biopy/SeqUtils/proparamdata.seq | 154 ++++++++++++++++++ .../sequtils_test/checksum_test.seq | 2 +- 3 files changed, 166 insertions(+), 12 deletions(-) create mode 100644 stdlib/biopy/SeqUtils/proparamdata.seq diff --git a/stdlib/biopy/SeqUtils/codonusageindices.seq b/stdlib/biopy/SeqUtils/codonusageindices.seq index 58f7072ed..5f066dd34 100644 --- a/stdlib/biopy/SeqUtils/codonusageindices.seq +++ b/stdlib/biopy/SeqUtils/codonusageindices.seq @@ -8,17 +8,17 @@ Sharp & Li, Nucleic Acids Res. 1987. # fmt: off SharpEcoliIndex = { - "GCA": 0.586, "GCC": 0.122, "GCG": 0.424, "GCT": 1, "AGA": 0.004, - "AGG": 0.002, "CGA": 0.004, "CGC": 0.356, "CGG": 0.004, "CGT": 1, "AAC": 1, - "AAT": 0.051, "GAC": 1, "GAT": 0.434, "TGC": 1, "TGT": 0.5, "CAA": 0.124, - "CAG": 1, "GAA": 1, "GAG": 0.259, "GGA": 0.01, "GGC": 0.724, "GGG": 0.019, - "GGT": 1, "CAC": 1, "CAT": 0.291, "ATA": 0.003, "ATC": 1, "ATT": 0.185, - "CTA": 0.007, "CTC": 0.037, "CTG": 1, "CTT": 0.042, "TTA": 0.02, - "TTG": 0.02, "AAA": 1, "AAG": 0.253, "ATG": 1, "TTC": 1, "TTT": 0.296, - "CCA": 0.135, "CCC": 0.012, "CCG": 1, "CCT": 0.07, "AGC": 0.41, - "AGT": 0.085, "TCA": 0.077, "TCC": 0.744, "TCG": 0.017, "TCT": 1, - "ACA": 0.076, "ACC": 1, "ACG": 0.099, "ACT": 0.965, "TGG": 1, "TAC": 1, - "TAT": 0.239, "GTA": 0.495, "GTC": 0.066, "GTG": 0.221, "GTT": 1 + "GCA": 0.586, "GCC": 0.122, "GCG": 0.424, "GCT": 1.0, "AGA": 0.004, + "AGG": 0.002, "CGA": 0.004, "CGC": 0.356, "CGG": 0.004, "CGT": 1.0, "AAC": 1.0, + "AAT": 0.051, "GAC": 1.0, "GAT": 0.434, "TGC": 1.0, "TGT": 0.5, "CAA": 0.124, + "CAG": 1.0, "GAA": 1.0, "GAG": 0.259, "GGA": 0.01, "GGC": 0.724, "GGG": 0.019, + "GGT": 1.0, "CAC": 1.0, "CAT": 0.291, "ATA": 0.003, "ATC": 1.0, "ATT": 0.185, + "CTA": 0.007, "CTC": 0.037, "CTG": 1.0, "CTT": 0.042, "TTA": 0.02, + "TTG": 0.02, "AAA": 1.0, "AAG": 0.253, "ATG": 1.0, "TTC": 1.0, "TTT": 0.296, + "CCA": 0.135, "CCC": 0.012, "CCG": 1.0, "CCT": 0.07, "AGC": 0.41, + "AGT": 0.085, "TCA": 0.077, "TCC": 0.744, "TCG": 0.017, "TCT": 1.0, + "ACA": 0.076, "ACC": 1.0, "ACG": 0.099, "ACT": 0.965, "TGG": 1.0, "TAC": 1.0, + "TAT": 0.239, "GTA": 0.495, "GTC": 0.066, "GTG": 0.221, "GTT": 1.0 } # Turn black code style on diff --git a/stdlib/biopy/SeqUtils/proparamdata.seq b/stdlib/biopy/SeqUtils/proparamdata.seq new file mode 100644 index 000000000..401f1d8c4 --- /dev/null +++ b/stdlib/biopy/SeqUtils/proparamdata.seq @@ -0,0 +1,154 @@ + +"""Indices to be used with ProtParam.""" + +# Turn black code style off +# fmt: off + +# Kyte & Doolittle index of hydrophobicity +# J. Mol. Biol. 157:105-132(1982). +kd = {"A": 1.8, "R": -4.5, "N": -3.5, "D": -3.5, "C": 2.5, + "Q": -3.5, "E": -3.5, "G": -0.4, "H": -3.2, "I": 4.5, + "L": 3.8, "K": -3.9, "M": 1.9, "F": 2.8, "P": -1.6, + "S": -0.8, "T": -0.7, "W": -0.9, "Y": -1.3, "V": 4.2} + +# Flexibility +# Normalized flexibility parameters (B-values), average +# Vihinen M., Torkkila E., Riikonen P. Proteins. 19(2):141-9(1994). +Flex = {"A": 0.984, "C": 0.906, "E": 1.094, "D": 1.068, + "G": 1.031, "F": 0.915, "I": 0.927, "H": 0.950, + "K": 1.102, "M": 0.952, "L": 0.935, "N": 1.048, + "Q": 1.037, "P": 1.049, "S": 1.046, "R": 1.008, + "T": 0.997, "W": 0.904, "V": 0.931, "Y": 0.929} + +# Hydrophilicity +# 1 Hopp & Wood +# Proc. Natl. Acad. Sci. U.S.A. 78:3824-3828(1981). +hw = {"A": -0.5, "R": 3.0, "N": 0.2, "D": 3.0, "C": -1.0, + "Q": 0.2, "E": 3.0, "G": 0.0, "H": -0.5, "I": -1.8, + "L": -1.8, "K": 3.0, "M": -1.3, "F": -2.5, "P": 0.0, + "S": 0.3, "T": -0.4, "W": -3.4, "Y": -2.3, "V": -1.5} + +# Surface accessibility +# Vergoten G & Theophanides T, Biomolecular Structure and Dynamics, +# pg.138 (1997). +# 1 Emini Surface fractional probability +em = {"A": 0.815, "R": 1.475, "N": 1.296, "D": 1.283, "C": 0.394, + "Q": 1.348, "E": 1.445, "G": 0.714, "H": 1.180, "I": 0.603, + "L": 0.603, "K": 1.545, "M": 0.714, "F": 0.695, "P": 1.236, + "S": 1.115, "T": 1.184, "W": 0.808, "Y": 1.089, "V": 0.606} + +# 2 Janin Interior to surface transfer energy scale +ja = {"A": 0.28, "R": -1.14, "N": -0.55, "D": -0.52, "C": 0.97, + "Q": -0.69, "E": -1.01, "G": 0.43, "H": -0.31, "I": 0.60, + "L": 0.60, "K": -1.62, "M": 0.43, "F": 0.46, "P": -0.42, + "S": -0.19, "T": -0.32, "W": 0.29, "Y": -0.15, "V": 0.60} + + +# A two dimensional dictionary for calculating the instability index. +# Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990). +# It is based on dipeptide values; therefore, the value for the dipeptide DG +# is DIWV['D']['G']. +DIWV = {"A": {"A": 1.0, "C": 44.94, "E": 1.0, "D": -7.49, + "G": 1.0, "F": 1.0, "I": 1.0, "H": -7.49, + "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, + "T": 1.0, "W": 1.0, "V": 1.0, "Y": 1.0}, + "C": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 20.26, + "G": 1.0, "F": 1.0, "I": 1.0, "H": 33.60, + "K": 1.0, "M": 33.60, "L": 20.26, "N": 1.0, + "Q": -6.54, "P": 20.26, "S": 1.0, "R": 1.0, + "T": 33.60, "W": 24.68, "V": -6.54, "Y": 1.0}, + "E": {"A": 1.0, "C": 44.94, "E": 33.60, "D": 20.26, + "G": 1.0, "F": 1.0, "I": 20.26, "H": -6.54, + "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 20.26, "P": 20.26, "S": 20.26, "R": 1.0, + "T": 1.0, "W": -14.03, "V": 1.0, "Y": 1.0}, + "D": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, + "G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0, + "K": -7.49, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 1.0, "P": 1.0, "S": 20.26, "R": -6.54, + "T": -14.03, "W": 1.0, "V": 1.0, "Y": 1.0}, + "G": {"A": -7.49, "C": 1.0, "E": -6.54, "D": 1.0, + "G": 13.34, "F": 1.0, "I": -7.49, "H": 1.0, + "K": -7.49, "M": 1.0, "L": 1.0, "N": -7.49, + "Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0, + "T": -7.49, "W": 13.34, "V": 1.0, "Y": -7.49}, + "F": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 13.34, + "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, + "K": -14.03, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, + "T": 1.0, "W": 1.0, "V": 1.0, "Y": 33.601}, + "I": {"A": 1.0, "C": 1.0, "E": 44.94, "D": 1.0, + "G": 1.0, "F": 1.0, "I": 1.0, "H": 13.34, + "K": -7.49, "M": 1.0, "L": 20.26, "N": 1.0, + "Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0, + "T": 1.0, "W": 1.0, "V": -7.49, "Y": 1.0}, + "H": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, + "G": -9.37, "F": -9.37, "I": 44.94, "H": 1.0, + "K": 24.68, "M": 1.0, "L": 1.0, "N": 24.68, + "Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0, + "T": -6.54, "W": -1.88, "V": 1.0, "Y": 44.94}, + "K": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, + "G": -7.49, "F": 1.0, "I": -7.49, "H": 1.0, + "K": 1.0, "M": 33.60, "L": -7.49, "N": 1.0, + "Q": 24.64, "P": -6.54, "S": 1.0, "R": 33.60, + "T": 1.0, "W": 1.0, "V": -7.49, "Y": 1.0}, + "M": {"A": 13.34, "C": 1.0, "E": 1.0, "D": 1.0, + "G": 1.0, "F": 1.0, "I": 1.0, "H": 58.28, + "K": 1.0, "M": -1.88, "L": 1.0, "N": 1.0, + "Q": -6.54, "P": 44.94, "S": 44.94, "R": -6.54, + "T": -1.88, "W": 1.0, "V": 1.0, "Y": 24.68}, + "L": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, + "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, + "K": -7.49, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 33.60, "P": 20.26, "S": 1.0, "R": 20.26, + "T": 1.0, "W": 24.68, "V": 1.0, "Y": 1.0}, + "N": {"A": 1.0, "C": -1.88, "E": 1.0, "D": 1.0, + "G": -14.03, "F": -14.03, "I": 44.94, "H": 1.0, + "K": 24.68, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": -6.54, "P": -1.88, "S": 1.0, "R": 1.0, + "T": -7.49, "W": -9.37, "V": 1.0, "Y": 1.0}, + "Q": {"A": 1.0, "C": -6.54, "E": 20.26, "D": 20.26, + "G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0, + "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 20.26, "P": 20.26, "S": 44.94, "R": 1.0, + "T": 1.0, "W": 1.0, "V": -6.54, "Y": -6.54}, + "P": {"A": 20.26, "C": -6.54, "E": 18.38, "D": -6.54, + "G": 1.0, "F": 20.26, "I": 1.0, "H": 1.0, + "K": 1.0, "M": -6.54, "L": 1.0, "N": 1.0, + "Q": 20.26, "P": 20.26, "S": 20.26, "R": -6.54, + "T": 1.0, "W": -1.88, "V": 20.26, "Y": 1.0}, + "S": {"A": 1.0, "C": 33.60, "E": 20.26, "D": 1.0, + "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, + "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 20.26, "P": 44.94, "S": 20.26, "R": 20.26, + "T": 1.0, "W": 1.0, "V": 1.0, "Y": 1.0}, + "R": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, + "G": -7.49, "F": 1.0, "I": 1.0, "H": 20.26, + "K": 1.0, "M": 1.0, "L": 1.0, "N": 13.34, + "Q": 20.26, "P": 20.26, "S": 44.94, "R": 58.28, + "T": 1.0, "W": 58.28, "V": 1.0, "Y": -6.54}, + "T": {"A": 1.0, "C": 1.0, "E": 20.26, "D": 1.0, + "G": -7.49, "F": 13.34, "I": 1.0, "H": 1.0, + "K": 1.0, "M": 1.0, "L": 1.0, "N": -14.03, + "Q": -6.54, "P": 1.0, "S": 1.0, "R": 1.0, + "T": 1.0, "W": -14.03, "V": 1.0, "Y": 1.0}, + "W": {"A": -14.03, "C": 1.0, "E": 1.0, "D": 1.0, + "G": -9.37, "F": 1.0, "I": 1.0, "H": 24.68, + "K": 1.0, "M": 24.68, "L": 13.34, "N": 13.34, + "Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0, + "T": -14.03, "W": 1.0, "V": -7.49, "Y": 1.0}, + "V": {"A": 1.0, "C": 1.0, "E": 1.0, "D": -14.03, + "G": -7.49, "F": 1.0, "I": 1.0, "H": 1.0, + "K": -1.88, "M": 1.0, "L": 1.0, "N": 1.0, + "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, + "T": -7.49, "W": 1.0, "V": 1.0, "Y": -6.54}, + "Y": {"A": 24.68, "C": 1.0, "E": -6.54, "D": 24.68, + "G": -7.49, "F": 1.0, "I": 1.0, "H": 13.34, + "K": 1.0, "M": 44.94, "L": 1.0, "N": 1.0, + "Q": 1.0, "P": 13.34, "S": 1.0, "R": -15.91, + "T": -7.49, "W": -9.37, "V": 1.0, "Y": 13.34}, + } + +# Turn black code style on +# fmt: on \ No newline at end of file diff --git a/test/stdlib/biopy_test/sequtils_test/checksum_test.seq b/test/stdlib/biopy_test/sequtils_test/checksum_test.seq index 167a72b9b..5283b5c9c 100644 --- a/test/stdlib/biopy_test/sequtils_test/checksum_test.seq +++ b/test/stdlib/biopy_test/sequtils_test/checksum_test.seq @@ -20,4 +20,4 @@ def test_gcg(): num1 = gcg(str_light_chain_one) num2 = gcg(str_light_chain_two) assert num1 != num2 -test_gcg() \ No newline at end of file +test_gcg() From 9c1411c80352d0b3336f7fd4301233ebf9ca5f9a Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Wed, 11 Dec 2019 12:25:46 -0800 Subject: [PATCH 07/16] Add biopy.sequtils.codonusage.seq --- stdlib/biopy/SeqUtils/checksum.seq | 6 +- stdlib/biopy/SeqUtils/codonusage.seq | 194 ++++++++++++++++++ .../sequtils_test/codonusage_test.seq | 26 +++ 3 files changed, 223 insertions(+), 3 deletions(-) create mode 100644 stdlib/biopy/SeqUtils/codonusage.seq create mode 100644 test/stdlib/biopy_test/sequtils_test/codonusage_test.seq diff --git a/stdlib/biopy/SeqUtils/checksum.seq b/stdlib/biopy/SeqUtils/checksum.seq index 99cc60da2..4493628a6 100644 --- a/stdlib/biopy/SeqUtils/checksum.seq +++ b/stdlib/biopy/SeqUtils/checksum.seq @@ -28,7 +28,7 @@ def _init_table_h() -> list[int]: #Initialisation _table_h = _init_table_h() -def crc64(s: str) -> str: +def crc64(s) -> str: """ crc64(s) @@ -51,7 +51,7 @@ def crc64(s: str) -> str: return f"CRC-{crch}{crcl}" #"CRC-%08X%08X" % (crch, crcl) -def gcg(s: str) -> int: +def gcg(s) -> int: """ gcg(s) @@ -72,7 +72,7 @@ def gcg(s: str) -> int: # TODO/CAVEATS: # @jordan - This still needs to be implemented #hashlib and base64?? -# def seguid(s: seq) -> str: +# def seguid(s) -> str: # """ # seguid(seq: seq) diff --git a/stdlib/biopy/SeqUtils/codonusage.seq b/stdlib/biopy/SeqUtils/codonusage.seq new file mode 100644 index 000000000..cb76843aa --- /dev/null +++ b/stdlib/biopy/SeqUtils/codonusage.seq @@ -0,0 +1,194 @@ + +"""Methods for codon usage calculations.""" + +import math +from biopy.sequtils.codonusageindices import SharpEcoliIndex +import bio +#DONT HAVE YET from Bio import SeqIO + +# Turn black code style off +# fmt: off + +CodonsDict = { + "TTT": 0, "TTC": 0, "TTA": 0, "TTG": 0, + "CTT": 0, "CTC": 0, "CTA": 0, "CTG": 0, + "ATT": 0, "ATC": 0, "ATA": 0, "ATG": 0, + "GTT": 0, "GTC": 0, "GTA": 0, "GTG": 0, + "TAT": 0, "TAC": 0, "TAA": 0, "TAG": 0, + "CAT": 0, "CAC": 0, "CAA": 0, "CAG": 0, + "AAT": 0, "AAC": 0, "AAA": 0, "AAG": 0, + "GAT": 0, "GAC": 0, "GAA": 0, "GAG": 0, + "TCT": 0, "TCC": 0, "TCA": 0, "TCG": 0, + "CCT": 0, "CCC": 0, "CCA": 0, "CCG": 0, + "ACT": 0, "ACC": 0, "ACA": 0, "ACG": 0, + "GCT": 0, "GCC": 0, "GCA": 0, "GCG": 0, + "TGT": 0, "TGC": 0, "TGA": 0, "TGG": 0, + "CGT": 0, "CGC": 0, "CGA": 0, "CGG": 0, + "AGT": 0, "AGC": 0, "AGA": 0, "AGG": 0, + "GGT": 0, "GGC": 0, "GGA": 0, "GGG": 0} + +# Turn black code style on +# fmt: on + +# this dictionary shows which codons encode the same AA +SynonymousCodons = { + "CYS": ["TGT", "TGC"], + "ASP": ["GAT", "GAC"], + "SER": ["TCT", "TCG", "TCA", "TCC", "AGC", "AGT"], + "GLN": ["CAA", "CAG"], + "MET": ["ATG"], + "ASN": ["AAC", "AAT"], + "PRO": ["CCT", "CCG", "CCA", "CCC"], + "LYS": ["AAG", "AAA"], + "STOP": ["TAG", "TGA", "TAA"], + "THR": ["ACC", "ACA", "ACG", "ACT"], + "PHE": ["TTT", "TTC"], + "ALA": ["GCA", "GCC", "GCG", "GCT"], + "GLY": ["GGT", "GGG", "GGA", "GGC"], + "ILE": ["ATC", "ATA", "ATT"], + "LEU": ["TTA", "TTG", "CTC", "CTT", "CTG", "CTA"], + "HIS": ["CAT", "CAC"], + "ARG": ["CGA", "CGC", "CGG", "CGT", "AGG", "AGA"], + "TRP": ["TGG"], + "VAL": ["GTA", "GTC", "GTG", "GTT"], + "GLU": ["GAG", "GAA"], + "TYR": ["TAT", "TAC"], +} + +class CodonAdaptationIndex: + """ + A codon adaptation index (CAI) implementation. + + Implements the codon adaptation index (CAI) described by Sharp and + Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95). + + NOTE - This implementation does not currently cope with alternative genetic + codes: only the synonymous codons in the standard table are considered. + """ + index: dict[str, float] + codon_count: dict[str, int] + + def __init__(self: CodonAdaptationIndex): + self.index = dict[str, float]() + self.codon_count = dict[str, int]() + + # use this method with predefined CAI index + def set_cai_index(self, index): + """ + set_cai_index(self, index) + + Set up an index to be used when calculating CAI for a gene. + + Just pass a dictionary similar to the SharpEcoliIndex in the + CodonUsageIndices module. + """ + self.index = index + + def generate_index(self, fasta_file): + """ + generate_index(self, fasta_file) + + Generate a codon usage index from a FASTA file of CDS sequences. + + Takes a location of a Fasta file containing CDS sequences + (which must all have a whole number of codons) and generates a codon + usage index. + + RCSU values + """ + # first make sure we're not overwriting an existing index: + if self.index != dict[str, float]() or self.codon_count != dict[str, int](): + raise ValueError( + "an index has already been set or a codon count " + "has been done. Cannot overwrite either." + ) + + # count codon occurrences in the file. + self._count_codons(fasta_file) + + # now to calculate the index we first need to sum the number of times + # synonymous codons were used all together. + for aa in SynonymousCodons: + total = 0.0 + # RCSU values are CodonCount/((1/num of synonymous codons) * sum of + # all synonymous codons) + rcsu = list[float]() + codons = SynonymousCodons[aa] + + for codon in codons: + total += self.codon_count[codon] + + # calculate the RSCU value for each of the codons + for codon in codons: + denominator = float(total) / len(codons) + rcsu.append(self.codon_count[codon] / denominator) + + # now generate the index W=RCSUi/RCSUmax: + rcsu_max = max(rcsu) + for codon_index, codon in enumerate(codons): + self.index[codon] = rcsu[codon_index] / rcsu_max + + def cai_for_gene(self, dna_sequence: str) -> float: + """ + cai_for_gene(self, dna_sequence) + + Calculate the CAI (float) for the provided DNA sequence (string). + + This method uses the Index (either the one you set or the one you + generated) and returns the CAI for the DNA sequence. + """ + cai_value, cai_length = 0.0, 0 + + # if no index is set or generated, the default SharpEcoliIndex will + # be used. + if self.index == dict[str, float](): + self.set_cai_index(SharpEcoliIndex) + + if dna_sequence.islower(): + dna_sequence = dna_sequence.upper() + + for i in range(0, len(dna_sequence), 3): + codon = dna_sequence[i : i + 3] + if codon in self.index: + # these two codons are always one, exclude them: + if codon not in ["ATG", "TGG"]: + cai_value += math.log(self.index[codon]) + cai_length += 1 + # some indices may not include stop codons: + elif codon not in ["TGA", "TAA", "TAG"]: + raise TypeError( + f"illegal codon in sequence: {codon}.\n{self.index}" + ) + + return math.exp(cai_value / (cai_length - 1.0)) + + def _count_codons(self, fasta_file): + + # make the codon dictionary local + self.codon_count = CodonsDict.copy() + + dna_sequence = '' + + # iterate over sequence and count all the codons in the FastaFile. + for cur_record in bio.FASTA(fasta_file): + # make sure the sequence is lower case + if str(cur_record.seq).islower(): + dna_sequence = str(cur_record.seq).upper() + else: + dna_sequence = str(cur_record.seq) + for i in range(0, len(dna_sequence), 3): + codon = dna_sequence[i : i + 3] + if codon in self.codon_count: + self.codon_count[codon] += 1 + else: + raise TypeError( + f"illegal codon {codon} in gene: {cur_record}" + ) + + def print_index(self): + """ + Print out the index used. + This just gives the index when the objects is printed. + """ + for i in sorted(self.index): + print(f"{i}\t{self.index[i]}") diff --git a/test/stdlib/biopy_test/sequtils_test/codonusage_test.seq b/test/stdlib/biopy_test/sequtils_test/codonusage_test.seq new file mode 100644 index 000000000..fd273893f --- /dev/null +++ b/test/stdlib/biopy_test/sequtils_test/codonusage_test.seq @@ -0,0 +1,26 @@ +from biopy.sequtils.codonusage import CodonAdaptationIndex +import math + +X = CodonAdaptationIndex() +path = "/Users/jordanwatson1/seq/test/stdlib/biopy_test/sequtils_test/scratch.fasta" + +X.generate_index(path) + +@test +def cai_for_gene(): + PRECISION = 1e-6 + + # Test Codon Adaptation Index (CAI) using default E. coli data. + CAI = CodonAdaptationIndex() + value = CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG") + assert math.fabs(value - 0.09978) < PRECISION + + C = CodonAdaptationIndex() + cai = X.cai_for_gene("ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGA") + print cai + # assert math.fabs(cai - 0.6723) < PRECISION +cai_for_gene() + +def print_index(): + X.print_index() +# print_index() \ No newline at end of file From c654d51102fab80b149aa986b19507d60be9dcaf Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Wed, 11 Dec 2019 12:26:44 -0800 Subject: [PATCH 08/16] Add codonusage_test.seq --- test/stdlib/biopy_test/sequtils_test/codonusage_test.seq | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/stdlib/biopy_test/sequtils_test/codonusage_test.seq b/test/stdlib/biopy_test/sequtils_test/codonusage_test.seq index fd273893f..bc1aa7934 100644 --- a/test/stdlib/biopy_test/sequtils_test/codonusage_test.seq +++ b/test/stdlib/biopy_test/sequtils_test/codonusage_test.seq @@ -14,11 +14,6 @@ def cai_for_gene(): CAI = CodonAdaptationIndex() value = CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG") assert math.fabs(value - 0.09978) < PRECISION - - C = CodonAdaptationIndex() - cai = X.cai_for_gene("ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGA") - print cai - # assert math.fabs(cai - 0.6723) < PRECISION cai_for_gene() def print_index(): From dbceb28f1dce76f9f62af82c86d987d438598c3e Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Wed, 11 Dec 2019 16:14:21 -0800 Subject: [PATCH 09/16] Add biopy/sequtils/lcc.seq --- stdlib/biopy/SeqUtils/lcc.seq | 145 ++++++++++++++++++ .../biopy_test/sequtils_test/lcc_test.seq | 25 +++ 2 files changed, 170 insertions(+) create mode 100644 stdlib/biopy/SeqUtils/lcc.seq create mode 100644 test/stdlib/biopy_test/sequtils_test/lcc_test.seq diff --git a/stdlib/biopy/SeqUtils/lcc.seq b/stdlib/biopy/SeqUtils/lcc.seq new file mode 100644 index 000000000..e044ba9ab --- /dev/null +++ b/stdlib/biopy/SeqUtils/lcc.seq @@ -0,0 +1,145 @@ + +"""Local Composition Complexity.""" + +import math + +def lcc_mult(s, wsize: int): + """ + lcc_mult(s, wsize) + + Calculate Local Composition Complexity (LCC) values over sliding window. + + Returns a list of floats, the LCC values for a sliding window over + the sequence. + """ + l2 = math.log(2.0) + tamseq = len(s) + + upper = s.upper() + + compone = [0.0] + lccsal = [0.0] + for i in range(wsize): + compone.append( + ((i + 1) / float(wsize)) * ((math.log((i + 1) / float(wsize))) / l2) + ) + window = s[0:wsize] + cant_a = window.count("A") + cant_c = window.count("C") + cant_t = window.count("T") + cant_g = window.count("G") + term_a = compone[cant_a] + term_c = compone[cant_c] + term_t = compone[cant_t] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + tail = s[0] + for x in range(tamseq - wsize): + window = upper[x + 1 : wsize + x + 1] + if tail == window[-1]: + lccsal.append(lccsal[-1]) + elif tail == "A": + cant_a -= 1 + if window.endswith("C"): + cant_c += 1 + term_a = compone[cant_a] + term_c = compone[cant_c] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("T"): + cant_t += 1 + term_a = compone[cant_a] + term_t = compone[cant_t] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("G"): + cant_g += 1 + term_a = compone[cant_a] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif tail == "C": + cant_c -= 1 + if window.endswith("A"): + cant_a += 1 + term_a = compone[cant_a] + term_c = compone[cant_c] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("T"): + cant_t += 1 + term_c = compone[cant_c] + term_t = compone[cant_t] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("G"): + cant_g += 1 + term_c = compone[cant_c] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif tail == "T": + cant_t -= 1 + if window.endswith("A"): + cant_a += 1 + term_a = compone[cant_a] + term_t = compone[cant_t] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("C"): + cant_c += 1 + term_c = compone[cant_c] + term_t = compone[cant_t] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("G"): + cant_g += 1 + term_t = compone[cant_t] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif tail == "G": + cant_g -= 1 + if window.endswith("A"): + cant_a += 1 + term_a = compone[cant_a] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("C"): + cant_c += 1 + term_c = compone[cant_c] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + elif window.endswith("T"): + cant_t += 1 + term_t = compone[cant_t] + term_g = compone[cant_g] + lccsal.append(-(term_a + term_c + term_t + term_g)) + tail = window[0] + return lccsal + + +def lcc_simp(s): + """ + lcc_simp(s) + + Calculate Local Composition Complexity (LCC) for a sequence. + """ + wsize = len(s) + term_a = 0.0 + term_c = 0.0 + term_t = 0.0 + term_g = 0.0 + + + upper = s.upper() + + l2 = math.log(2.0) + if "A" in s: + term_a = ((upper.count("A")) / float(wsize)) * ( + (math.log((upper.count("A")) / float(wsize))) / l2 + ) + if "C" in s: + term_c = ((upper.count("C")) / float(wsize)) * ( + (math.log((upper.count("C")) / float(wsize))) / l2 + ) + if "T" in s: + term_t = ((upper.count("T")) / float(wsize)) * ( + (math.log((upper.count("T")) / float(wsize))) / l2 + ) + if "G" in s: + term_g = ((upper.count("G")) / float(wsize)) * ( + (math.log((upper.count("G")) / float(wsize))) / l2 + ) + return -(term_a + term_c + term_t + term_g) \ No newline at end of file diff --git a/test/stdlib/biopy_test/sequtils_test/lcc_test.seq b/test/stdlib/biopy_test/sequtils_test/lcc_test.seq new file mode 100644 index 000000000..aa5e9c86e --- /dev/null +++ b/test/stdlib/biopy_test/sequtils_test/lcc_test.seq @@ -0,0 +1,25 @@ +from biopy.sequtils.lcc import lcc_simp, lcc_mult +import math + +str_light_chain_one = ("QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQ" + "HPGKAPKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGL" + "QAEDEADYYCSSYAGSSTLVFGGGTKLTVL") + +exp_simple_LCC = 1.03 +exp_window_LCC = (0.00, 1.00, 0.96, 0.96, 0.96, 0.65, 0.43, 0.35, 0.35, 0.35, 0.35, 0.53, 0.59, 0.26) + +@test +def lcc_simp_test(): + PRECISION = 1e2 + assert math.fabs(lcc_simp(str_light_chain_one) - exp_simple_LCC) < PRECISION +lcc_simp_test() + +@test +def lcc_mult_test(): + PRECISION = 1e2 + values = lcc_mult(str_light_chain_one, 20) + assert len(values) == len(exp_window_LCC) + + for val1, val2 in zip(values, exp_window_LCC): + assert math.fabs(val1 - val2) < PRECISION +lcc_mult_test() \ No newline at end of file From e335b4ebe0c46ff708774d4c45f04e0676a2a392 Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Mon, 16 Dec 2019 11:08:33 -0800 Subject: [PATCH 10/16] Unfinished seqrecord.seq and seqfeature.seq. Committing for progress --- stdlib/bio/seqfeature.seq | 1295 ++++++++++++++++++++++++++++++++++++ stdlib/biopy/seqrecord.seq | 529 +++++++++++++++ 2 files changed, 1824 insertions(+) create mode 100644 stdlib/bio/seqfeature.seq create mode 100644 stdlib/biopy/seqrecord.seq diff --git a/stdlib/bio/seqfeature.seq b/stdlib/bio/seqfeature.seq new file mode 100644 index 000000000..451eafcbb --- /dev/null +++ b/stdlib/bio/seqfeature.seq @@ -0,0 +1,1295 @@ + +""" +Represent a Sequence Feature holding info about a part of a sequence. +TODO/CAVEATS: +@jordan- For now MutableSeq is left out +""" + +#from bio.seq import reverse_complement + +# --- Handling feature locations + +#class FeatureLocation(object): +class FeatureLocation: + """ + Specify the location of a feature along a sequence. + """ + start: int + end: int + strand: int + ref: int + ref_db: int + + def __init__(self: FeatureLocation, start: int, end: int, strand: optional[int]=None, ref: optional[int]=None, ref_db: optional[int]=None): + """ + Initialize the class. + """ + # TODO - Check 0 <= start <= end (<= length of reference) + self._start = start + + self._end = end + + if self.start > self.end: + raise ValueError(f"""End location ({self.end}) must be greater than or equal + to start location ({self.start})""") + self.strand = strand + self.ref = ref + self.ref_db = ref_db + + def _get_strand(self: FeatureLocation): + """ + Get function for the strand property (PRIVATE). + """ + return self._strand + + def _set_strand(self: FeatureLocation, value: int): + """ + Set function for the strand property (PRIVATE). + """ + if value not in [+1, -1, 0, None]: + raise ValueError(f"Strand should be +1, -1, 0 or None, not {value}") + self._strand = value + + @property + def strand(self: FeatureLocation, fget: optional[int]=None, fset: optional[int]=None, doc="Strand of the location (+1, -1, 0 or None)."): + if fget is None: + fget = self._get_strand + if fset is None: + fset = self._set_strand + return self._strand + + def __str__(self: FeatureLocation): + """ + Return a representation of the FeatureLocation object. + """ + answer = f"[{self._start}:{self._end}]" + if self.ref and self.ref_db: + answer = f"{self.ref_db}:{self.ref}{answer}" + elif self.ref: + answer = self.ref + answer + if self.strand is None: + return answer + elif self.strand == +1: + return answer + "(+)" + elif self.strand == -1: + return answer + "(-)" + else: + # strand = 0, stranded but strand unknown, ? in GFF3 + return answer + "(?)" + + def __repr__(self: FeatureLocation): + """ + Represent the FeatureLocation object as a string for debugging. + """ + optional = "" + if self.strand is not None: + optional += F", strand={self.strand}" + if self.ref is not None: + optional += f", ref={self.ref}" + if self.ref_db is not None: + optional += f", ref_db={self.ref_db}" + return f"{self.__class__.__name__}({self.start}, {self.end}{optional})" \ + + def __add__(self: FeatureLocation, other: FeatureLocation): + """ + Combine location with another FeatureLocation object, or shift it. + """ + return self.CompoundLocation([self, other]) + + # Not sure about `other` type + def __radd__(self: FeatureLocation, other: int): + """ + Add a feature locationanother FeatureLocation object to the left. + """ + return self._shift(other) + + + def __nonzero__(self: FeatureLocation): + """ + Return True regardless of the length of the feature. + """ + return True + + def __len__(self: FeatureLocation): + """ + Return the length of the region described by the FeatureLocation object. + """ + return int(self._end) - int(self._start) + + def __contains__(self: FeatureLocation, value: int): + """ + Check if an integer position is within the FeatureLocation object. + """ + if value < self._start or value >= self._end: + return False + else: + return True + + def __iter__(self: FeatureLocation): + """ + Iterate over the parent positions within the FeatureLocation object. + """ + if self.strand == -1: + for i in range(self._end - 1, self._start - 1, -1): + yield i + else: + for i in range(self._start, self._end): + yield i + + def __eq__(self: FeatureLocation, other: FeatureLocation): + """ + Implement equality by comparing all the location attributes. + """ + return self._start == other.start and \ + self._end == other.end and \ + self._strand == other.strand and \ + self.ref == other.ref and \ + self.ref_db == other.ref_db + + def __ne__(self: FeatureLocation, other: FeatureLocation): + """ + Implement the not-equal operand. + """ + return not self == other + + def _shift(self: FeatureLocation, offset: int): + """ + Return a copy of the FeatureLocation shifted by an offset (PRIVATE). + """ + if self.ref or self.ref_db: + raise ValueError("Feature references another sequence.") + return FeatureLocation(start=self._start._shift(offset), + end=self._end._shift(offset), + strand=self.strand) + + def _flip(self: FeatureLocation, length): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + flip_strand = 0 + if self.ref or self.ref_db: + raise ValueError("Feature references another sequence.") + # Note this will flip the start and end too! + if self.strand == +1: + flip_strand = -1 + elif self.strand == -1: + flip_strand = +1 + else: + # 0 or None + flip_strand = self.strand + return FeatureLocation(start=self._end._flip(length), + end=self._start._flip(length), + strand=flip_strand) + + @property + def parts(self: FeatureLocation): + """ + Read only list of sections (always one, the FeatureLocation object). + """ + return [self] + + @property + def start(self: FeatureLocation): + """ + Start location - left most (minimum) value, regardless of strand. + """ + return self._start + + @property + def end(self: FeatureLocation): + """ + End location - right most (maximum) value, regardless of strand. + """ + return self._end + + @property + def nofuzzy_start(self: FeatureLocation): + """ + Start position (integer, approximated if fuzzy, read only) (OBSOLETE). + """ + return int(self._start) + + @property + def nofuzzy_end(self: FeatureLocation): + """ + End position (integer, approximated if fuzzy, read only) (OBSOLETE). + """ + return int(self._end) + + # TODO/CAVEATS: + # @jordan- reverse_complement needs to be implemented still + # def extract(self: FeatureLocation, parent_sequence: seq): + # """ + # Extract the sequence from supplied parent sequence using the FeatureLocation object. + # """ + # if self.ref or self.ref_db: + # raise ValueError("Feature references another sequence.") + + # f_seq = parent_sequence[self.nofuzzy_start:self.nofuzzy_end] + # if self.strand == -1: + # f_seq = reverse_complement(f_seq) + + # return f_seq + + # def extract(self: FeatureLocation, parent_sequence: str): + # """ + # Extract the sequence from supplied parent sequence using the FeatureLocation object. + # """ + # if self.ref or self.ref_db: + # raise ValueError("Feature references another sequence.") + + # f_seq = parent_sequence[self.nofuzzy_start:self.nofuzzy_end] + # if self.strand == -1: + # f_seq = reverse_complement(f_seq) + + # return f_seq + +# class SeqFeature(object): +class SeqFeature: + """ + Represent a Sequence Feature on an object. + """ + location: FeatureLocation + typ: str + location_operator: str + strand: int + id: str + qualifiers: dict[str, list[str]] + ref: int + ref_db: int + + + def __init__(self: SeqFeature, location: optional[FeatureLocation]=None, typ: str="", location_operator: str="", + strand: optional[int]=None, id: str="", + qualifiers: optional[dict[str, list[str]]]=None, + ref: optional[int]=None, ref_db: optional[int]=None): + """ + Initialize a SeqFeature on a Sequence. + location can either be a FeatureLocation (with strand argument also + given if required), or None. + + TODO/CAVEATS: + @jordan - took out sub_features from parameter for now + - Because OrderedDict is not implemented yet, qualifiers is manditory + """ + + self.location = location + self.typ = typ + + if location_operator: + self.location_operator = location_operator + + if strand is not None: + self.strand = strand + + self.id = id + self.qualifiers = qualifiers + + if ref: + self.ref = ref + + if ref_db: + self.ref_db = ref_db + + def _get_strand(self: SeqFeature): + """ + Get function for the strand property (PRIVATE). + """ + return self.location.strand + + def _set_strand(self: SeqFeature, value: int): + """ + Set function for the strand property (PRIVATE). + """ + self.location.strand = value + + @property + def strand(self: SeqFeature, fget: optional[int]=None, fset: optional[int]= None, doc="""Feature's strand + This is a shortcut for feature.location.strand"""): + if fget is None: + fget = self._get_strand + if fset is None: + fset = self._set_strand + return self._strand + + + def _get_ref(self: SeqFeature): + """ + Get function for the reference property (PRIVATE). + """ + return self.location.ref + + + def _set_ref(self: SeqFeature, value): + """ + Set function for the reference property (PRIVATE). + """ + self.location.ref = value + + @property + def ref(self: SeqFeature, fget: optional[int]=None, fset: optional[int]= None, + doc="""Feature location reference (e.g. accession). + This is a shortcut for feature.location.ref + """): + if fget is None: + fget = self._get_ref + if fset is None: + fset = self._set_ref + + return self._ref + + def _get_ref_db(self: SeqFeature): + """ + Get function for the database reference property (PRIVATE). + """ + return self.location.ref_db + + + def _set_ref_db(self: SeqFeature, value): + """ + Set function for the database reference property (PRIVATE). + """ + self.location.ref_db = value + + @property + def ref_db(self: SeqFeature, fget: optional[int]= None, fset: optional[int]=None, + doc="""Feature location reference's database. + This is a shortcut for feature.location.ref_db + """): + if fget is None: + fget = self._get_ref_db + if fset is None: + fset = self._set_ref_db + return self._ref_db + + def _get_location_operator(self: SeqFeature): + """ + Get function for the location operator property (PRIVATE). + """ + return self.location.operator + + def _set_location_operator(self, value): + """ + Set function for the location operator property (PRIVATE). + """ + self.location.operator = value + + @property + def location_operator(self: SeqFeature, fget: optional[str]=None, fset: optional[str]=None, + doc="Location operator for compound locations (e.g. join)."): + if fget is None: + fget = self._get_location_operator + if fset is None: + fset = self._set_location_operator + return self._location_operator + + def __repr__(self: SeqFeature): + """ + Represent the feature as a string for debugging. + """ + answer = f"{self.__class__.__name__}({self.repr(self.location)}" + if self.typ: + answer += f", type={self.repr(self.typ)}" + if self.location_operator: + answer += f", location_operator={self.repr(self.location_operator)}" + if self.id and self.id != "": + answer += f", id={self.repr(self.id)}" + if self.ref: + answer += f", ref={self.repr(self.ref)}" + if self.ref_db: + answer += f", ref_db={self.repr(self.ref_db)}" + answer += ")" + return answer + + def __str__(self: SeqFeature): + """ + Return the full feature as a python string. + """ + out = f"type: {self.typ}\n" + out += f"location: {self.location}\n" + if self.id and self.id != "": + out += f"id: {self.id}\n" + out += "qualifiers:\n" + for qual_key in sorted(self.qualifiers): + out += f" Key: {qual_key}, Value: {self.qualifiers[qual_key]}\n" + return out + + def _shift(self: SeqFeature, offset): + """ + Return a copy of the feature with its location shifted (PRIVATE). + The annotation qaulifiers are copied. + """ + return SeqFeature(location=self.location._shift(offset), + typ=self.typ, + location_operator=self.location_operator, + id=self.id, + qualifiers=OrderedDict(self.qualifiers.items())) + + def _flip(self: SeqFeature, length: int): + """ + Return a copy of the feature with its location flipped (PRIVATE). + """ + return SeqFeature(location=self.location._flip(length), + typ=self.typ, + location_operator=self.location_operator, + id=self.id, + qualifiers=OrderedDict(self.qualifiers.items())) + + def extract(self: SeqFeature, parent_sequence: seq): + """ + Extract the feature's sequence from supplied parent sequence. + """ + if self.location is None: + raise ValueError("The feature's .location is None. Check the " + "sequence file for a valid location.") + return self.location.extract(parent_sequence) + + def extract(self: SeqFeature, parent_sequence: str): + """ + Extract the feature's sequence from supplied parent sequence. + """ + if self.location is None: + raise ValueError("The feature's .location is None. Check the " + "sequence file for a valid location.") + return self.location.extract(parent_sequence) + + def translate(self: SeqFeature, parent_sequence, table="Standard", start_offset: optional[int]=None, + stop_symbol="*", to_stop=False, cds=None, gap=None): + """ + Get a translation of the feature's sequence. + """ + # see if this feature should be translated in a different + # frame using the "codon_start" qualifier + if start_offset is None: + try: + start_offset = int(self.qualifiers["codon_start"][0]) - 1 + + except KeyError: + start_offset = 0 + + if start_offset not in [0, 1, 2]: + raise ValueError(f"""The start_offset must be 0, 1, or 2. + The supplied value is {start_offset}. Check the value + of either the codon_start qualifier or + the start_offset argument""") + + feat_seq = self.extract(parent_sequence)[start_offset:] + codon_table = self.qualifiers.get("transl_table", [table])[0] + + if cds is None: + cds = (self.typ == "CDS") + + return feat_seq.translate(table=codon_table, stop_symbol=stop_symbol, + to_stop=to_stop, cds=cds, gap=gap) + + def __bool__(self): + """ + Boolean value of an instance of this class (True). + """ + return True + + def __len__(self): + """ + Return the length of the region where the feature is located. + """ + return len(self.location) + + def __iter__(self): + """ + Iterate over the parent positions within the feature. + """ + return iter(self.location) + + def __contains__(self, value): + """ + Check if an integer position is within the feature. + """ + return value in self.location + + +# --- References + +#class Reference(object): +class Reference: + """ + Represent a Generic Reference object. + """ + location: list[Location] + authors: str + title: str + journal: str + medline_id: str + pubmed_id: str + comment: str + + def __init__(self: Reference): + """ + Initialize the class. + """ + self.location = list[Location]() + self.authors = "" + self.consrtm = "" + self.title = "" + self.journal = "" + self.medline_id = "" + self.pubmed_id = "" + self.comment = "" + + def __str__(self: Reference): + """ + Return the full Reference object as a python string. + """ + out = "" + for single_location in self.location: + out += f"location: {single_location}\n" + out += f"authors: {self.authors}\n" + if self.consrtm: + out += f"consrtm: {self.consrtm}\n" + out += f"title: {self.title}\n" + out += f"journal: {self.journal}\n" + out += f"medline id: {self.medline_id}\n" + out += f"pubmed id: {self.pubmed_id}\n" + out += f"comment: {self.comment}\n" + return out + + def __repr__(self: Reference): + """ + Represent the Reference object as a string for debugging. + """ + return f"{self.__class__.__name__}(title={repr(self.title)}, ...)" + + def __eq__(self: Reference, other: Reference): + """ + Check if two Reference objects should be considered equal. + """ + return self.authors == other.authors and \ + self.consrtm == other.consrtm and \ + self.title == other.title and \ + self.journal == other.journal and \ + self.medline_id == other.medline_id and \ + self.pubmed_id == other.pubmed_id and \ + self.comment == other.comment and \ + self.location == other.location + + def __ne__(self: Reference, other: Reference): + """ + Implement the not-equal operand. + """ + return not self == other + +#class CompoundLocation(object): +class CompoundLocation: + """ + For handling joins etc where a feature location has several parts. + """ + parts: list[FeatureLocation] + operator: str + + def __init__(self: CompoundLocation, parts: list[FeatureLocation], operator: str="join"): + """ + Initialize the class. + """ + self.operator = operator + self.parts = parts + + if len(parts) < 2: + raise ValueError(f"CompoundLocation should have at least 2 parts, not {parts}") + + def __str__(self: CompoundLocation): + """ + Return a representation of the CompoundLocation object. + """ + return f"{self.operator}, {", ".join(str(loc) for loc in self.parts)}" + + def __repr__(self: CompoundLocation): + """ + Represent the CompoundLocation object as string for debugging. + """ + return f"{self.__class__.__name__}({self.parts}, {self.operator})" + + + def _get_strand(self: CompoundLocation): + """ + Get function for the strand property (PRIVATE). + """ + if len({loc.strand for loc in self.parts}) == 1: + return self.parts[0].strand + else: + return None # i.e. mixed strands + + def _set_strand(self: CompoundLocation, value: int): + """ + Set function for the strand property (PRIVATE). + """ + for loc in self.parts: + loc.strand = value + + @property + def strand(self: CompoundLocation, fget=_get_strand, fset=_set_strand, + doc="""Overall strand of the compound location. + If all the parts have the same strand, that is returned. Otherwise + for mixed strands, this returns None. + >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation + >>> f1 = FeatureLocation(15, 17, strand=1) + >>> f2 = FeatureLocation(20, 30, strand=-1) + >>> f = f1 + f2 + >>> f1.strand + 1 + >>> f2.strand + -1 + >>> f.strand + >>> f.strand is None + True + If you set the strand of a CompoundLocation, this is applied to + all the parts - use with caution: + >>> f.strand = 1 + >>> f1.strand + 1 + >>> f2.strand + 1 + >>> f.strand + 1 + """): + return self._strand + + def __add__(self: CompoundLocation, other: FeatureLocation): + """ + Combine locations. + """ + return CompoundLocation(self.parts + [other], self.operator) + + def __add__(self: CompoundLocation, other: CompoundLocation): + """ + Combine locations. + """ + if self.operator != other.operator: + # Handle join+order -> order as a special case? + raise ValueError(F"Mixed operators {self.operator} and {other.operator}") + return CompoundLocation(self.parts + other.parts, self.operator) + + def __add__(self: CompoundLocation, other: int): + """ + Shift the location by an integer offset. + """ + return self._shift(other) + + + def __radd__(self: CompoundLocation, other: FeatureLocation): + """ + Add a feature to the left. + """ + return CompoundLocation([other] + self.parts, self.operator) + + + def __radd__(self: CompoundLocation, other: int): + """ + Add a feature to the left. + """ + return self._shift(other) + + + def __contains__(self: CompoundLocation, value: int): + """ + Check if an integer position is within the CompoundLocation object. + """ + for loc in self.parts: + if value in loc: + return True + return False + + def __nonzero__(self: CompoundLocation): + """ + Return True regardless of the length of the feature. + """ + return True + + def __len__(self: CompoundLocation): + """ + Return the length of the CompoundLocation object. + """ + return sum(len(loc) for loc in self.parts) + + def __iter__(self: CompoundLocation): + """ + Iterate over the parent positions within the CompoundLocation object. + """ + for loc in self.parts: + for pos in loc: + yield pos + + def __eq__(self: CompoundLocation, other: CompoundLocation): + """ + Check if all parts of CompoundLocation are equal to all parts of other CompoundLocation. + """ + if len(self.parts) != len(other.parts): + return False + if self.operator != other.operator: + return False + for self_part, other_part in zip(self.parts, other.parts): + if self_part != other_part: + return False + return True + + def __ne__(self: CompoundLocation, other: CompoundLocation): + """ + Implement the not-equal operand. + """ + return not self == other + + def _shift(self: CompoundLocation, offset: int): + """ + Return a copy of the CompoundLocation shifted by an offset (PRIVATE). + """ + return CompoundLocation([loc._shift(offset) for loc in self.parts], + self.operator) + + def _flip(self: CompoundLocation, length: int): + """ + Return a copy of the locations after the parent is reversed (PRIVATE). + """ + return CompoundLocation([loc._flip(length) for loc in self.parts], + self.operator) + + @property + def start(self: CompoundLocation): + """ + Start location - left most (minimum) value, regardless of strand. + """ + return min(loc.start for loc in self.parts) + + @property + def end(self: CompoundLocation): + """ + End location - right most (maximum) value, regardless of strand. + """ + return max(loc.end for loc in self.parts) + + @property + def nofuzzy_start(self: CompoundLocation) -> int: + """ + Start position (integer, approximated if fuzzy, read only) (OBSOLETE). + """ + return int(self.start) + + @property + def nofuzzy_end(self: CompoundLocation) -> int: + """ + End position (integer, approximated if fuzzy, read only) (OBSOLETE). + """ + return int(self.end) + + + @property + def ref(self: CompoundLocation): + """ + Not present in CompoundLocation, dummy method for API compatibility. + """ + return None + + @property + def ref_db(self: CompoundLocation): + """ + Not present in CompoundLocation, dummy method for API compatibility. + """ + return None + + def extract(self: CompoundLocation, parent_sequence: seq): + """ + Extract the sequence from supplied parent sequence using the CompoundLocation object. + """ + # This copes with mixed strand features & all on reverse: + parts = [loc.extract(parent_sequence) for loc in self.parts] + # We use addition rather than a join to avoid alphabet issues: + f_seq = parts[0] + for part in parts[1:]: + f_seq += part + return f_seq + + def extract(self: CompoundLocation, parent_sequence: str): + """ + Extract the sequence from supplied parent sequence using the CompoundLocation object. + """ + # This copes with mixed strand features & all on reverse: + parts = [loc.extract(parent_sequence) for loc in self.parts] + # We use addition rather than a join to avoid alphabet issues: + f_seq = parts[0] + for part in parts[1:]: + f_seq += part + return f_seq + +# class AbstractPosition(object): +class AbstractPosition: + """ + Abstract base class representing a position. + """ + + def __repr__(self: AbstractPosition): + """ + Represent the AbstractPosition object as a string for debugging. + """ + return f"{self.__class__.__name__}(...)" + +# class ExactPosition(int, AbstractPosition): +class ExactPosition: + """ + Specify the specific position of a boundary. + - position - The position of the boundary. + - extension - An optional argument which must be zero since we don't + have an extension. The argument is provided so that the same number + of arguments can be passed to all position types. + """ + position: int + extension: int + + def __new__(cls: ExactPosition, position: int, extension: int=0): + """ + Create an ExactPosition object. + """ + if extension != 0: + raise AttributeError(f"Non-zero extension {extension} for exact position.") + + return int.__new__(cls, position) + + def __str__(self: ExactPosition): + """ + Return a representation of the ExactPosition object. + """ + return str(int(self)) + + def __repr__(self: ExactPosition): + """ + Represent the ExactPosition object as a string for debugging. + """ + return f"{self.__class__.__name__}({int(self)})" + + @property + def position(self: ExactPosition): + """ + Legacy attribute to get position as integer (OBSOLETE). + """ + return int(self) + + @property + def extension(self: ExactPosition): + """ + Not present in this object, return zero (OBSOLETE). + """ + return 0 + + def _shift(self: ExactPosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + # By default preserve any subclass + return self.__class__(int(self) + offset) + + def _flip(self: ExactPosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + # By default perserve any subclass + return self.__class__(length - int(self)) + + +# class UncertainPosition(ExactPosition): +# """ +# Specify a specific position which is uncertain. +# This is used in UniProt, e.g. ?222 for uncertain position 222, or in the +# XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL. +# """ + +# pass + +# class UnknownPosition(AbstractPosition): +class UnknownPosition: + """ + Specify a specific position which is unknown (has no position). + """ + + def __repr__(self: UnknownPosition): + """ + Represent the UnknownPosition object as a string for debugging. + """ + return f"{self.__class__.__name__}()" + + # TODO/CAVEATS: + # @jordan - hash values + # def __hash__(self: UnknownPosition): + # """ + # Return the hash value of the UnknownPosition object. + # """ + # return hash(None) + + @property + def position(self: UnknownPosition): + """ + Legacy attribute to get location (None) (OBSOLETE). + """ + return None + + @property + def extension(self: UnknownPosition): # noqa: D402 + """ + Legacy attribute to get extension (zero) as integer (OBSOLETE). + """ + return 0 + + def _shift(self: UnknownPosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self + + def _flip(self: UnknownPosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + return self + +# class WithinPosition(int, AbstractPosition): +class WithinPosition: + """ + Specify the position of a boundary within some coordinates. + """ + position: int + left: int + right: int + + def __new__(cls: WithinPosition, position: int, left: int, right: int): + """ + Create a WithinPosition object. + """ + if not (position == left or position == right): + raise RuntimeError(f"WithinPosition: {position} should match left {left} or right {right}") + obj = int.__new__(cls, position) + obj._left = left + obj._right = right + return obj + + def __getnewargs__(self: WithinPosition): + """ + Return the arguments accepted by __new__. + Necessary to allow pickling and unpickling of class instances. + """ + return (int(self), self._left, self._right) + + def __repr__(self: WithinPosition): + """ + Represent the WithinPosition object as a string for debugging. + """ + return f"{self.__class__.__name__}({int(self)}, left={self._left}, right={self._right})" \ + + def __str__(self: WithinPosition): + """ + Return a representation of the WithinPosition object (with python counting). + """ + return f"({self._left}.{self._right})" + + @property + def position(self: WithinPosition): + """ + Legacy attribute to get (left) position as integer (OBSOLETE). + """ + return self._left + + @property + def extension(self: WithinPosition): # noqa: D402 + """ + Legacy attribute to get extension (from left to right) as an integer (OBSOLETE). + """ + return self._right - self._left + + def _shift(self: WithinPosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self.__class__(int(self) + offset, + self._left + offset, + self._right + offset) + + def _flip(self: WithinPosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + return self.__class__(length - int(self), + length - self._right, + length - self._left) + + +# class BetweenPosition(int, AbstractPosition): +class BetweenPosition: + """ + Specify the position of a boundary between two coordinates (OBSOLETE?). + """ + position: int + left: int + right: int + + def __new__(cls: BetweenPosition, position: int, left: int, right: int): + """ + Create a new instance in BetweenPosition object. + """ + assert position == left or position == right + obj = int.__new__(cls, position) + obj._left = left + obj._right = right + return obj + + def __getnewargs__(self: BetweenPosition): + """ + Return the arguments accepted by __new__. + Necessary to allow pickling and unpickling of class instances. + """ + return (int(self), self._left, self._right) + + def __repr__(self: BetweenPosition): + """ + Represent the BetweenPosition object as a string for debugging. + """ + return f"{self.__class__.__name__}({int(self)}, left={self._left}, right={self._right})" \ + + def __str__(self: BetweenPosition): + """ + Return a representation of the BetweenPosition object (with python counting). + """ + return f"({self._left}^{self._right})" + + @property + def position(self: BetweenPosition): + """ + Legacy attribute to get (left) position as integer (OBSOLETE). + """ + return self._left + + @property + def extension(self: BetweenPosition): # noqa: D402 + """ + Legacy attribute to get extension (from left to right) as an integer (OBSOLETE). + """ + return self._right - self._left + + def _shift(self: BetweenPosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self.__class__(int(self) + offset, + self._left + offset, + self._right + offset) + + def _flip(self: BetweenPosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + return self.__class__(length - int(self), + length - self._right, + length - self._left) + +# class BeforePosition(int, AbstractPosition): +class BeforePosition: + """ + Specify a position where the actual location occurs before it. + """ + position: int + extension: int + + # Subclasses int so can't use __init__ + def __new__(cls: BeforePosition, position: int, extension: int=0): + """ + Create a new instance in BeforePosition object. + """ + if extension != 0: + raise AttributeError(f"Non-zero extension {extension} for exact position.") + + return int.__new__(cls, position) + + @property + def position(self: BeforePosition): + """ + Legacy attribute to get position as integer (OBSOLETE). + """ + return int(self) + + @property + def extension(self: BeforePosition): # noqa: D402 + """ + Legacy attribute to get extension (zero) as integer (OBSOLETE). + """ + return 0 + + def __repr__(self: BeforePosition): + """ + Represent the location as a string for debugging. + """ + return f"{self.__class__.__name__}({int(self)})" + + def __str__(self: BeforePosition): + """ + Return a representation of the BeforePosition object (with python counting). + """ + return "<{self.position}" + + def _shift(self: BeforePosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self.__class__(int(self) + offset) + + def _flip(self: BeforePosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + return AfterPosition(length - int(self)) + +# class AfterPosition(int, AbstractPosition): +class AfterPosition: + """ + Specify a position where the actual location is found after it. + """ + position: int + extension: int + + # Subclasses int so can't use __init__ + def __new__(cls: AfterPosition, position: int, extension: int=0): + """ + Create a new instance of the AfterPosition object. + """ + if extension != 0: + raise AttributeError(f"Non-zero extension {extension} for exact position.") + return int.__new__(cls, position) + + @property + def position(self: AfterPosition): + """ + Legacy attribute to get position as integer (OBSOLETE). + """ + return int(self) + + @property + def extension(self: AfterPosition): # noqa: D402 + """ + Legacy attribute to get extension (zero) as integer (OBSOLETE). + """ + return 0 + + def __repr__(self: AfterPosition): + """ + Represent the location as a string for debugging. + """ + return f"{self.__class__.__name__}({int(self)})" + + def __str__(self: AfterPosition): + """ + Return a representation of the AfterPosition object (with python counting). + """ + return f">{self.position}" + + def _shift(self: AfterPosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self.__class__(int(self) + offset) + + def _flip(self: AfterPosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + return BeforePosition(length - int(self)) + +#class OneOfPosition(int, AbstractPosition) +class OneOfPosition: + """ + Specify a position where the location can be multiple positions. + """ + position: int + choices: list[ExactPosition] + + def __new__(cls: OneOfPosition, position: int, choices: list[ExactPosition]): + """ + Initialize with a set of possible positions. + position_list is a list of AbstractPosition derived objects, + specifying possible locations. + position is an integer specifying the default behaviour. + """ + if position not in choices: + raise ValueError(f"OneOfPosition: {position} should match one of {choices}") + obj = int.__new__(cls, position) + obj.position_choices = choices + return obj + + def __getnewargs__(self: OneOfPosition): + """ + Return the arguments accepted by __new__. + Necessary to allow pickling and unpickling of class instances. + """ + return (int(self), self.position_choices) + + @property + def position(self: OneOfPosition): + """ + Legacy attribute to get (left) position as integer (OBSOLETE). + """ + return min(int(pos) for pos in self.position_choices) + + @property + def extension(self: OneOfPosition): + """ + Legacy attribute to get extension as integer (OBSOLETE). + """ + positions = [int(pos) for pos in self.position_choices] + return max(positions) - min(positions) + + def __repr__(self: OneOfPosition): + """ + Represent the OneOfPosition object as a string for debugging. + """ + return f"{self.__class__.__name__}({int(self)}, choices={self.position_choices})" + + + def __str__(self: OneOfPosition): + """ + Return a representation of the OneOfPosition object. + """ + out = "one-of(" + for position in self.position_choices: + out += f"{position}," + # replace the last comma with the closing parenthesis + return out[:-1] + ")" + + def _shift(self: OneOfPosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self.__class__(int(self) + offset, + [p._shift(offset) for p in self.position_choices]) + + def _flip(self: OneOfPosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + return self.__class__(length - int(self), + [p._flip(length) for p in self.position_choices[::-1]]) + +#class PositionGap(object): +class PositionGap: + """ + Simple class to hold information about a gap between positions. + """ + gap_size: int + + def __init__(self: PositionGap, gap_size: int): + """ + Intialize with a position object containing the gap information. + """ + self.gap_size = gap_size + + def __repr__(self: PositionGap): + """ + Represent the position gap as a string for debugging. + """ + return f"{self.__class__.__name__}({repr(self.gap_size)})" + + def __str__(self: PositionGap): + """ + Return a representation of the PositionGap object. + """ + return f"gap({self.gap_size})" diff --git a/stdlib/biopy/seqrecord.seq b/stdlib/biopy/seqrecord.seq new file mode 100644 index 000000000..0bc2cd6e9 --- /dev/null +++ b/stdlib/biopy/seqrecord.seq @@ -0,0 +1,529 @@ + +"""Represent a Sequence Record, a sequence with annotation.""" + +_NO_SEQRECORD_COMPARISON = "SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest." + + +class _RestrictedDict: + """ + Dict which only allows sequences of given length as values (PRIVATE). + """ + d: dict[str, str] + _length: int + + def __init__(self: _RestrictedDict, length: int): + """Create an EMPTY restricted dictionary.""" + self.d.__init__(self) + self._length = length + + def __setitem__(self, key, value): + self.d.__setitem__(self, key, value) + + def update(self, new_dict): + # Force this to go via our strict __setitem__ method + for key, value in new_dict.items(): + self[key] = value + +class SeqRecord: + """ + A SeqRecord object holds a sequence and information about it. + """ + _seq: seq + id: str + name: str + description: str + dbxrefs: list[str] + annotations: dict[str, str] + letter_annotations: dict[int, str] + features: list[SeqFeature] + + def __init__(self: SeqRecord, s: seq, id: str="", name: str="", + description: str="", dbxrefs: optional[list[str]]=None, + features: list[SeqFeature]=None, annotations: optional[dict[str, str]]=None, + letter_annotations: dict[int, str]=None): + """ + Create a SeqRecord. + """ + self._seq = s + self.id = id + self.name = name + self.description = description + + # database cross references (for the whole sequence) + if dbxrefs is None: + dbxrefs = list[str]() + self.dbxrefs = dbxrefs + + # annotations about the whole sequence + if annotations is None: + annotations = dict[str, str]() + self.annotations = annotations + + if letter_annotations is None: + # annotations about each letter in the sequence + if s is None: + self._per_letter_annotations = _RestrictedDict(length=0) + else: + self._per_letter_annotations = \ + _RestrictedDict(length=len(s)) + + else: + # This will be handled via the property set function, which will + # turn this into a _RestrictedDict and thus ensure all the values + # in the dict are the right length + self.letter_annotations = letter_annotations + + # annotations about parts of the sequence + if features is None: + features = list[SeqFeature] + self.features = features + + def _set_per_letter_annotations(self, value): + # Turn this into a restricted-dictionary (and check the entries) + try: + self._per_letter_annotations = _RestrictedDict(length=len(self.s)) + except AttributeError: + # e.g. s is None + self._per_letter_annotations = _RestrictedDict(length=0) + self._per_letter_annotations.update(value) + + # letter_annotations = property( + # fget=lambda self: self._per_letter_annotations, + # fset=_set_per_letter_annotations, + # doc="""Dictionary of per-letter-annotation for the sequence. + # For example, this can hold quality scores used in FASTQ or QUAL files. + # Consider this example using Bio.SeqIO to read in an example Solexa + # variant FASTQ file as a SeqRecord: + # >>> from Bio import SeqIO + # >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") + # >>> print("%s %s" % (record.id, record.seq)) + # slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN + # >>> print(list(record.letter_annotations)) + # ['solexa_quality'] + # >>> print(record.letter_annotations["solexa_quality"]) + # [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] + # The letter_annotations get sliced automatically if you slice the + # parent SeqRecord, for example taking the last ten bases: + # >>> sub_record = record[-10:] + # >>> print("%s %s" % (sub_record.id, sub_record.seq)) + # slxa_0001_1_0001_01 ACGTNNNNNN + # >>> print(sub_record.letter_annotations["solexa_quality"]) + # [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] + # Any python sequence (i.e. list, tuple or string) can be recorded in + # the SeqRecord's letter_annotations dictionary as long as the length + # matches that of the SeqRecord's sequence. e.g. + # >>> len(sub_record.letter_annotations) + # 1 + # >>> sub_record.letter_annotations["dummy"] = "abcdefghij" + # >>> len(sub_record.letter_annotations) + # 2 + # You can delete entries from the letter_annotations dictionary as usual: + # >>> del sub_record.letter_annotations["solexa_quality"] + # >>> sub_record.letter_annotations + # {'dummy': 'abcdefghij'} + # You can completely clear the dictionary easily as follows: + # >>> sub_record.letter_annotations = {} + # >>> sub_record.letter_annotations + # {} + # Note that if replacing the record's sequence with a sequence of a + # different length you must first clear the letter_annotations dict. + # """) + + def _set_seq(self, value): + if self._per_letter_annotations: + if len(self) != len(value): + raise ValueError("You must empty the letter annotations first!") + else: + # Leave the existing per letter annotations unchanged: + self._seq = value + else: + self._seq = value + # Reset the (empty) letter annotations dict with new length: + try: + self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) + except AttributeError: + # e.g. seq is None + self._per_letter_annotations = _RestrictedDict(length=0) + + # seq = property(fget=lambda self: self._seq, + # fset=_set_seq, + # doc="The sequence itself, as a Seq or MutableSeq object.") + + def __getitem__(self: SeqRecord, index: int): + """ + Return a an individual letter. + """ + return self.seq[index] + + + def __getitem__(self: SeqRecord, index: slice): + """ + Return a sub-sequence. + Slicing, e.g. my_record[5:10], returns a new SeqRecord for + that sub-sequence with some annotation preserved. + """ + if self.seq is None: + raise ValueError("If the sequence is None, we cannot slice it.") + parent_length = len(self) + + answer = SeqRecord(self.seq[index], + id=self.id, + name=self.name, + description=self.description) + + start, stop, step = index.indices(parent_length) + if step == 1: + # Select relevant features, add them with shifted locations + # assert str(self.seq)[index] == str(self.seq)[start:stop] + for f in self.features: + if start <= f.location.nofuzzy_start \ + and f.location.nofuzzy_end <= stop: + answer.features.append(f._shift(-start)) + + # Slice all the values to match the sliced sequence + # (this should also work with strides, even negative strides): + for key, value in self.letter_annotations.items(): + answer._per_letter_annotations[key] = value[index] + + return answer + + def __iter__(self: SeqRecord): + """ + Iterate over the letters in the sequence. + """ + lines = list[str] + if self.id: + lines.append(f"ID: {self.id}") + if self.name: + lines.append(f"Name: {self.name}") + if self.description: + lines.append(f"Description: {self.description}") + if self.dbxrefs: + lines.append("Database cross-references: " + ", ".join(self.dbxrefs)) + lines.append(f"Number of features: {self.features}") + for a in self.annotations: + lines.append(f"/{a}={self.annotations[a]}") + if self.letter_annotations: + lines.append("Per letter annotation for: " + ", ".join(self.letter_annotations)) + # Don't want to include the entire sequence, + # and showing the alphabet is useful: + lines.append(repr(self.seq)) + return "\n".join(lines) + + def __repr__(self: SeqRecord): + """ + Return a concise summary of the record for debugging (string). + """ + # TODO!!! how to do this? + return "{self.__class__.__name__}(seq={self.seq}, id={self.id}, name={self.name}, description={self.description}, dbxrefs={self.dbxrefs})" + + # TODO/CAVEATS: + # @jordan - `format` and `__format__ still need to be implemented` + # def format(self: SeqRecord, format): + # """ + # Return the record as a string in the specified file format. + # """ + # # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 + # # See also the Bio.Align.Generic.Alignment class and its format() + # return self.__format__(format) + + # def __format__(self: SeqRecord, format_spec: str): + # """ + # Return the record as a string in the specified file format. + # This method supports the python format() function added in + # Python 2.6/3.0. The format_spec should be a lower case string + # supported by Bio.SeqIO as an output file format. See also the + # SeqRecord's format() method. + # Under Python 3 binary formats raise a ValueError, while on + # Python 2 this will work with a deprecation warning. + # """ + # if not islower(format_spec): + # # Follow python convention and default to using __str__ + # return str(self) + # # NOT IMPLEMENTED YET + # from Bio import SeqIO + + # # Easy case, can call string-building function directly + # if format_spec in SeqIO._FormatToString: + # return SeqIO._FormatToString[format_spec](self) + + # if format_spec in SeqIO._BinaryFormats: + # import sys + # if sys.version_info[0] < 3: + # import warnings + # from Bio import BiopythonDeprecationWarning + # warnings.warn( + # "Binary format %s cannot be used with SeqRecord format method on Python 3" + # % format_spec, + # BiopythonDeprecationWarning + # ) + # # Continue - Python 2 StringIO will work... + # else: + # raise ValueError( + # "Binary format %s cannot be used with SeqRecord format method" + # % format_spec + # ) + + # # Harder case, make a temp handle instead + # from Bio._py3k import StringIO + # handle = StringIO() + # SeqIO.write(self, handle, format_spec) + # return handle.getvalue() + + def __len__(self): + """ + Return the length of the sequence. + """ + return len(self.seq) + + def __lt__(self, other): + """ + Define the less-than operand (not implemented). + """ + raise NotImplementedError(_NO_SEQRECORD_COMPARISON) + + def __le___(self, other): + """ + Define the less-than-or-equal-to operand (not implemented). + """ + raise NotImplementedError(_NO_SEQRECORD_COMPARISON) + + def __eq__(self, other): + """ + Define the equal-to operand (not implemented). + """ + raise NotImplementedError(_NO_SEQRECORD_COMPARISON) + + def __ne__(self, other): + """ + Define the not-equal-to operand (not implemented). + """ + raise NotImplementedError(_NO_SEQRECORD_COMPARISON) + + def __gt__(self, other): + """ + Define the greater-than operand (not implemented). + """ + raise NotImplementedError(_NO_SEQRECORD_COMPARISON) + + def __ge__(self, other): + """ + Define the greater-than-or-equal-to operand (not implemented). + """ + raise NotImplementedError(_NO_SEQRECORD_COMPARISON) + + # Note Python 3 does not use __cmp__ and there is no need to + # define __cmp__ on Python 2 as have all of _lt__ etc defined. + + # Python 3: + def __bool__(self): + """ + Boolean value of an instance of this class (True). + """ + return True + + def __add__(self: SeqRecord, other: str): + """ + Add another sequence or string to this sequence. + """ + # Note can't transfer any per-letter-annotations + return SeqRecord(self.seq + other, + id=self.id, name=self.name, + description=self.description, + features=self.features[:], + annotations=self.annotations.copy(), + dbxrefs=self.dbxrefs[:]) + + def __add__(self: SeqRecord, other: seq): + """ + Add another sequence or string to this sequence. + """ + # Note can't transfer any per-letter-annotations + return SeqRecord(self.seq + other, + id=self.id, name=self.name, + description=self.description, + features=self.features[:], + annotations=self.annotations.copy(), + dbxrefs=self.dbxrefs[:]) + + def __add__(self: SeqRecord, other: SeqRecord): + """ + Add another sequence or string to this sequence. + """ + + # Adding two SeqRecord objects... must merge annotation. + answer = SeqRecord(self.seq + other.seq, + features=self.features[:], + dbxrefs=self.dbxrefs[:]) + # Will take all the features and all the db cross refs, + length = len(self) + for f in other.features: + answer.features.append(f._shift(length)) + del length + for ref in other.dbxrefs: + if ref not in answer.dbxrefs: + answer.dbxrefs.append(ref) + # Take common id/name/description/annotation + if self.id == other.id: + answer.id = self.id + if self.name == other.name: + answer.name = self.name + if self.description == other.description: + answer.description = self.description + for k, v in self.annotations.items(): + if k in other.annotations and other.annotations[k] == v: + answer.annotations[k] = v + # Can append matching per-letter-annotation + for k, v in self.letter_annotations.items(): + if k in other.letter_annotations: + answer.letter_annotations[k] = v + other.letter_annotations[k] + return answer + + def __radd__(self: SeqRecord, other: str): + """ + Add another string to this sequence (from the left). + """ + # Note can't transfer any per-letter-annotations + offset = len(other) + return SeqRecord(other + self.seq, + id=self.id, name=self.name, + description=self.description, + features=[f._shift(offset) for f in self.features], + annotations=self.annotations.copy(), + dbxrefs=self.dbxrefs[:]) + + def __radd__(self: SeqRecord, other: seq): + """ + Add another sequence to this sequence (from the left). + """ + # Note can't transfer any per-letter-annotations + offset = len(other) + return SeqRecord(other + self.seq, + id=self.id, name=self.name, + description=self.description, + features=[f._shift(offset) for f in self.features], + annotations=self.annotations.copy(), + dbxrefs=self.dbxrefs[:]) + + def upper(self): + """ + Return a copy of the record with an upper case sequence. + """ + return SeqRecord(self.seq.upper(), + id=self.id, name=self.name, + description=self.description, + dbxrefs=self.dbxrefs[:], + features=self.features[:], + annotations=self.annotations.copy(), + letter_annotations=self.letter_annotations.copy()) + + def lower(self): + """ + Return a copy of the record with a lower case sequence. + """ + return SeqRecord(self.seq.lower(), + id=self.id, name=self.name, + description=self.description, + dbxrefs=self.dbxrefs[:], + features=self.features[:], + annotations=self.annotations.copy(), + letter_annotations=self.letter_annotations.copy()) + + def reverse_complement(self: SeqRecord, id: str=False, name: str=False, description: str=False, + features: list[SeqFeature]=False, annotations: dict[str, str]=False, + letter_annotations: dict[int, str]=False, dbxrefs: list[str]=False): + """ + Return new SeqRecord with reverse complement sequence. + """ + answer = SeqRecord(self.seq.reverse_complement()) + if id: + answer.id = id + else: + answer.id = self.id + + if name: + answer.name = name + else: + answer.name = self.name + + if description: + answer.description = description + else: + answer.description = self.description + + if dbxrefs: + answer.dbxrefs = dbxrefs + else: + answer.dbxrefs = self.dbxrefs[:] + + if features: + answer.features = features + else: + # Copy the old features, adjusting location and string + length = len(answer) + answer.features = [f._flip(length) for f in self.features] + answer.features.sort(key=lambda x: x.location.start.position) + + if annotations: + answer.annotations = annotations + else: + answer.annotations = self.annotations.copy() + + if letter_annotations: + answer.letter_annotations = letter_annotations + else: + # Copy the old per letter annotations, reversing them + for key, value in self.letter_annotations.items(): + answer._per_letter_annotations[key] = value[::-1] + + return answer + + def translate(self: SeqRecord, + # Seq translation arguments: + table="Standard", stop_symbol="*", to_stop=False, + cds=False, gap=None, + # SeqRecord annotation arguments: + id: str=False, name: str=False, description: str=False, + features: list[SeqFeature]=False, annotations: dict[str, str]=False, + letter_annotations: dict[int, str]=False, dbxrefs: list[str]=False): + """ + Return new SeqRecord with translated sequence. + """ + answer = SeqRecord(self.seq.translate(table=table, + stop_symbol=stop_symbol, + to_stop=to_stop, + cds=cds, + gap=gap)) + if id: + answer.id = id + else: + answer.id = self.id + + if name: + answer.name = name + else: + answer.name = self.name + + if description: + answer.description = description + else: + answer.description = self.description + + if dbxrefs: + answer.dbxrefs = dbxrefs + else: + answer.dbxrefs = self.dbxrefs[:] + + if features: + answer.features = features + + if annotations: + answer.annotations = annotations + else: + # Copy the old annotations + answer.annotations = self.annotations.copy() + + if letter_annotations: + answer.letter_annotations = letter_annotations + + return answer \ No newline at end of file From 5d22d3328fe338e4c1540b5c44744679ab381b5d Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Mon, 16 Dec 2019 11:21:23 -0800 Subject: [PATCH 11/16] Add AttributeError to err.seq --- stdlib/core/err.seq | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/stdlib/core/err.seq b/stdlib/core/err.seq index d23d98273..960d72fd5 100644 --- a/stdlib/core/err.seq +++ b/stdlib/core/err.seq @@ -149,6 +149,19 @@ class StopIteration: def message(self: StopIteration): return self._hdr.msg +class AttributeError: + _hdr: ExcHeader + + def __init__(self: AttributeError): + self._hdr = ('AttributeError', '', '', '', 0, 0) + + def __init__(self: AttributeError, message: str): + self._hdr = ('AttributeError', message, '', '', 0, 0) + + @property + def message(self: AttributeError): + return self._hdr.msg + def check_errno(prefix: str): msg = _C.seq_check_errno() if msg: From 388cf83a54847ebf61e2f94e196f9a64a1c96f3a Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Mon, 16 Dec 2019 11:24:02 -0800 Subject: [PATCH 12/16] Add RuntimeError to err.seq --- stdlib/core/err.seq | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/stdlib/core/err.seq b/stdlib/core/err.seq index 960d72fd5..a3f4bbe30 100644 --- a/stdlib/core/err.seq +++ b/stdlib/core/err.seq @@ -162,6 +162,19 @@ class AttributeError: def message(self: AttributeError): return self._hdr.msg +class RuntimeError: + _hdr: ExcHeader + + def __init__(self: RuntimeError): + self._hdr = ('RuntimeError', '', '', '', 0, 0) + + def __init__(self: RuntimeError, message: str): + self._hdr = ('RuntimeError', message, '', '', 0, 0) + + @property + def message(self: RuntimeError): + return self._hdr.msg + def check_errno(prefix: str): msg = _C.seq_check_errno() if msg: From 20bdef01111fc70c042de629bb7b4f243075399d Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Mon, 16 Dec 2019 11:30:46 -0800 Subject: [PATCH 13/16] seqfeature.seq compiles but not tested yet --- stdlib/bio/seqfeature.seq | 52 +++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/stdlib/bio/seqfeature.seq b/stdlib/bio/seqfeature.seq index 451eafcbb..ed46eed8e 100644 --- a/stdlib/bio/seqfeature.seq +++ b/stdlib/bio/seqfeature.seq @@ -423,7 +423,7 @@ class SeqFeature: typ=self.typ, location_operator=self.location_operator, id=self.id, - qualifiers=OrderedDict(self.qualifiers.items())) + qualifiers=self.OrderedDict(self.qualifiers.items())) def _flip(self: SeqFeature, length: int): """ @@ -433,7 +433,7 @@ class SeqFeature: typ=self.typ, location_operator=self.location_operator, id=self.id, - qualifiers=OrderedDict(self.qualifiers.items())) + qualifiers=self.OrderedDict(self.qualifiers.items())) def extract(self: SeqFeature, parent_sequence: seq): """ @@ -514,7 +514,7 @@ class Reference: """ Represent a Generic Reference object. """ - location: list[Location] + location: list[FeatureLocation] authors: str title: str journal: str @@ -526,7 +526,7 @@ class Reference: """ Initialize the class. """ - self.location = list[Location]() + self.location = list[FeatureLocation]() self.authors = "" self.consrtm = "" self.title = "" @@ -556,7 +556,7 @@ class Reference: """ Represent the Reference object as a string for debugging. """ - return f"{self.__class__.__name__}(title={repr(self.title)}, ...)" + return f"{self.__class__.__name__}(title={self.repr(self.title)}, ...)" def __eq__(self: Reference, other: Reference): """ @@ -599,7 +599,7 @@ class CompoundLocation: """ Return a representation of the CompoundLocation object. """ - return f"{self.operator}, {", ".join(str(loc) for loc in self.parts)}" + return f"""{self.operator}, {(", ".join(str(loc) for loc in self.parts))}""" def __repr__(self: CompoundLocation): """ @@ -625,7 +625,7 @@ class CompoundLocation: loc.strand = value @property - def strand(self: CompoundLocation, fget=_get_strand, fset=_set_strand, + def strand(self: CompoundLocation, fget: optional[int]=None, fset: optional[int]=None, doc="""Overall strand of the compound location. If all the parts have the same strand, that is returned. Otherwise for mixed strands, this returns None. @@ -650,6 +650,10 @@ class CompoundLocation: >>> f.strand 1 """): + if fget is None: + fget = self._get_strand + if fset is None: + fset = self._set_strand return self._strand def __add__(self: CompoundLocation, other: FeatureLocation): @@ -841,14 +845,14 @@ class ExactPosition: position: int extension: int - def __new__(cls: ExactPosition, position: int, extension: int=0): + def __init__(cls: ExactPosition, position: int, extension: int=0): """ Create an ExactPosition object. """ if extension != 0: raise AttributeError(f"Non-zero extension {extension} for exact position.") - return int.__new__(cls, position) + return int.__init__(cls, position) def __str__(self: ExactPosition): """ @@ -955,20 +959,20 @@ class WithinPosition: left: int right: int - def __new__(cls: WithinPosition, position: int, left: int, right: int): + def __init__(cls: WithinPosition, position: int, left: int, right: int): """ Create a WithinPosition object. """ if not (position == left or position == right): raise RuntimeError(f"WithinPosition: {position} should match left {left} or right {right}") - obj = int.__new__(cls, position) + obj = int.__init__(cls, position) obj._left = left obj._right = right return obj def __getnewargs__(self: WithinPosition): """ - Return the arguments accepted by __new__. + Return the arguments accepted by __init__. Necessary to allow pickling and unpickling of class instances. """ return (int(self), self._left, self._right) @@ -1025,19 +1029,19 @@ class BetweenPosition: left: int right: int - def __new__(cls: BetweenPosition, position: int, left: int, right: int): + def __init__(cls: BetweenPosition, position: int, left: int, right: int): """ Create a new instance in BetweenPosition object. """ assert position == left or position == right - obj = int.__new__(cls, position) + obj = int.__init__(cls, position) obj._left = left obj._right = right return obj def __getnewargs__(self: BetweenPosition): """ - Return the arguments accepted by __new__. + Return the arguments accepted by __init__. Necessary to allow pickling and unpickling of class instances. """ return (int(self), self._left, self._right) @@ -1093,14 +1097,14 @@ class BeforePosition: extension: int # Subclasses int so can't use __init__ - def __new__(cls: BeforePosition, position: int, extension: int=0): + def __init__(cls: BeforePosition, position: int, extension: int=0): """ Create a new instance in BeforePosition object. """ if extension != 0: raise AttributeError(f"Non-zero extension {extension} for exact position.") - return int.__new__(cls, position) + return int.__init__(cls, position) @property def position(self: BeforePosition): @@ -1138,7 +1142,7 @@ class BeforePosition: """ Return a copy of the location after the parent is reversed (PRIVATE). """ - return AfterPosition(length - int(self)) + return self.AfterPosition(length - int(self)) # class AfterPosition(int, AbstractPosition): class AfterPosition: @@ -1149,13 +1153,13 @@ class AfterPosition: extension: int # Subclasses int so can't use __init__ - def __new__(cls: AfterPosition, position: int, extension: int=0): + def __init__(cls: AfterPosition, position: int, extension: int=0): """ Create a new instance of the AfterPosition object. """ if extension != 0: raise AttributeError(f"Non-zero extension {extension} for exact position.") - return int.__new__(cls, position) + return int.__init__(cls, position) @property def position(self: AfterPosition): @@ -1203,7 +1207,7 @@ class OneOfPosition: position: int choices: list[ExactPosition] - def __new__(cls: OneOfPosition, position: int, choices: list[ExactPosition]): + def __init__(cls: OneOfPosition, position: int, choices: list[ExactPosition]): """ Initialize with a set of possible positions. position_list is a list of AbstractPosition derived objects, @@ -1212,13 +1216,13 @@ class OneOfPosition: """ if position not in choices: raise ValueError(f"OneOfPosition: {position} should match one of {choices}") - obj = int.__new__(cls, position) + obj = int.__init__(cls, position) obj.position_choices = choices return obj def __getnewargs__(self: OneOfPosition): """ - Return the arguments accepted by __new__. + Return the arguments accepted by __init__. Necessary to allow pickling and unpickling of class instances. """ return (int(self), self.position_choices) @@ -1286,7 +1290,7 @@ class PositionGap: """ Represent the position gap as a string for debugging. """ - return f"{self.__class__.__name__}({repr(self.gap_size)})" + return f"{self.__class__.__name__}({self.repr(self.gap_size)})" def __str__(self: PositionGap): """ From 9b5cf34b7df4f4292940a4d2e22af13ccba92c6e Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Tue, 17 Dec 2019 13:41:41 -0800 Subject: [PATCH 14/16] continuing to debug --- stdlib/bio/seqfeature.seq | 282 +++++++++++++++++++------------------ stdlib/biopy/seqrecord.seq | 11 +- 2 files changed, 156 insertions(+), 137 deletions(-) diff --git a/stdlib/bio/seqfeature.seq b/stdlib/bio/seqfeature.seq index ed46eed8e..95098aba3 100644 --- a/stdlib/bio/seqfeature.seq +++ b/stdlib/bio/seqfeature.seq @@ -5,7 +5,113 @@ TODO/CAVEATS: @jordan- For now MutableSeq is left out """ -#from bio.seq import reverse_complement +# class BeforePosition(int, AbstractPosition): +type BeforePosition(position: int): + """ + Specify a position where the actual location occurs before it. + """ + + # Subclasses int so can't use __init__ + def __init__(cls: BeforePosition, position: int, extension: int=0): + """ + Create a new instance in BeforePosition object. + """ + if extension != 0: + raise AttributeError(f"Non-zero extension {extension} for exact position.") + + return cls.position + + @property + def position(self: BeforePosition): + """ + Legacy attribute to get position as integer (OBSOLETE). + """ + return int(self.position) + + @property + def extension(self: BeforePosition): # noqa: D402 + """ + Legacy attribute to get extension (zero) as integer (OBSOLETE). + """ + return 0 + + def __repr__(self: BeforePosition): + """ + Represent the location as a string for debugging. + """ + return f"{self.__class__.__name__}({int(self)})" + + def __str__(self: BeforePosition): + """ + Return a representation of the BeforePosition object (with python counting). + """ + return "<{self.position}" + + def _shift(self: BeforePosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self.__class__(int(self) + offset) + + def _flip(self: BeforePosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + return self.AfterPosition(length - int(self)) + +# class AfterPosition(int, AbstractPosition): +type AfterPosition(position: int): + """ + Specify a position where the actual location is found after it. + """ + + # Subclasses int so can't use __init__ + def __init__(cls: AfterPosition, position: int, extension: int=0): + """ + Create a new instance of the AfterPosition object. + """ + if extension != 0: + raise AttributeError(f"Non-zero extension {extension} for exact position.") + return cls.position + + @property + def position(self: AfterPosition): + """ + Legacy attribute to get position as integer (OBSOLETE). + """ + return int(self) + + @property + def extension(self: AfterPosition): # noqa: D402 + """ + Legacy attribute to get extension (zero) as integer (OBSOLETE). + """ + return 0 + + def __repr__(self: AfterPosition): + """ + Represent the location as a string for debugging. + """ + return f"{self.__class__.__name__}({int(self)})" + + def __str__(self: AfterPosition): + """ + Return a representation of the AfterPosition object (with python counting). + """ + return f">{self.position}" + + def _shift(self: AfterPosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self.__class__(int(self) + offset) + + def _flip(self: AfterPosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + return BeforePosition(length - int(self)) + # --- Handling feature locations @@ -14,13 +120,13 @@ class FeatureLocation: """ Specify the location of a feature along a sequence. """ - start: int - end: int - strand: int - ref: int - ref_db: int + _start: int + _end: int + _strand: int + _ref: str + _ref_db: str - def __init__(self: FeatureLocation, start: int, end: int, strand: optional[int]=None, ref: optional[int]=None, ref_db: optional[int]=None): + def __init__(self: FeatureLocation, start: int, end: int, strand: optional[int]=None, ref: optional[str]=None, ref_db: optional[str]=None): """ Initialize the class. """ @@ -36,7 +142,23 @@ class FeatureLocation: self.ref = ref self.ref_db = ref_db - def _get_strand(self: FeatureLocation): + def __init__(self: FeatureLocation, start: BeforePosition, end: AfterPosition, strand: optional[int]=None, ref: optional[str]=None, ref_db: optional[str]=None): + """ + Initialize the class. + """ + # TODO - Check 0 <= start <= end (<= length of reference) + self._start = start + + self._end = end + + if self.start > self.end: + raise ValueError(f"""End location ({self.end}) must be greater than or equal + to start location ({self.start})""") + self.strand = strand + self.ref = ref + self.ref_db = ref_db + + def _get_strand(self: FeatureLocation) -> int: """ Get function for the strand property (PRIVATE). """ @@ -51,7 +173,7 @@ class FeatureLocation: self._strand = value @property - def strand(self: FeatureLocation, fget: optional[int]=None, fset: optional[int]=None, doc="Strand of the location (+1, -1, 0 or None)."): + def strand(self: FeatureLocation, fget: optional[int]=None, fset: optional[int]=None, doc: str="Strand of the location (+1, -1, 0 or None)."): if fget is None: fget = self._get_strand if fset is None: @@ -62,16 +184,16 @@ class FeatureLocation: """ Return a representation of the FeatureLocation object. """ - answer = f"[{self._start}:{self._end}]" - if self.ref and self.ref_db: - answer = f"{self.ref_db}:{self.ref}{answer}" - elif self.ref: - answer = self.ref + answer - if self.strand is None: + answer = f"[{self.start}:{self.end}]" + if self._ref and self._ref_db: + answer = f"{self._ref_db}:{self._ref}{answer}" + elif self._ref: + answer = self._ref + answer + if self._strand: return answer - elif self.strand == +1: + elif self._strand == +1: return answer + "(+)" - elif self.strand == -1: + elif self._strand == -1: return answer + "(-)" else: # strand = 0, stranded but strand unknown, ? in GFF3 @@ -142,9 +264,9 @@ class FeatureLocation: """ return self._start == other.start and \ self._end == other.end and \ - self._strand == other.strand and \ - self.ref == other.ref and \ - self.ref_db == other.ref_db + self._strand == other._strand and \ + self._ref == other._ref and \ + self._ref_db == other._ref_db def __ne__(self: FeatureLocation, other: FeatureLocation): """ @@ -255,14 +377,14 @@ class SeqFeature: strand: int id: str qualifiers: dict[str, list[str]] - ref: int - ref_db: int + ref: str + ref_db: str def __init__(self: SeqFeature, location: optional[FeatureLocation]=None, typ: str="", location_operator: str="", strand: optional[int]=None, id: str="", qualifiers: optional[dict[str, list[str]]]=None, - ref: optional[int]=None, ref_db: optional[int]=None): + ref: optional[str]=None, ref_db: optional[str]=None): """ Initialize a SeqFeature on a Sequence. location can either be a FeatureLocation (with strand argument also @@ -854,7 +976,7 @@ class ExactPosition: return int.__init__(cls, position) - def __str__(self: ExactPosition): + def __str__(self: ExactPosition) -> str: """ Return a representation of the ExactPosition object. """ @@ -1088,116 +1210,6 @@ class BetweenPosition: length - self._right, length - self._left) -# class BeforePosition(int, AbstractPosition): -class BeforePosition: - """ - Specify a position where the actual location occurs before it. - """ - position: int - extension: int - - # Subclasses int so can't use __init__ - def __init__(cls: BeforePosition, position: int, extension: int=0): - """ - Create a new instance in BeforePosition object. - """ - if extension != 0: - raise AttributeError(f"Non-zero extension {extension} for exact position.") - - return int.__init__(cls, position) - - @property - def position(self: BeforePosition): - """ - Legacy attribute to get position as integer (OBSOLETE). - """ - return int(self) - - @property - def extension(self: BeforePosition): # noqa: D402 - """ - Legacy attribute to get extension (zero) as integer (OBSOLETE). - """ - return 0 - - def __repr__(self: BeforePosition): - """ - Represent the location as a string for debugging. - """ - return f"{self.__class__.__name__}({int(self)})" - - def __str__(self: BeforePosition): - """ - Return a representation of the BeforePosition object (with python counting). - """ - return "<{self.position}" - - def _shift(self: BeforePosition, offset: int): - """ - Return a copy of the position object with its location shifted (PRIVATE). - """ - return self.__class__(int(self) + offset) - - def _flip(self: BeforePosition, length: int): - """ - Return a copy of the location after the parent is reversed (PRIVATE). - """ - return self.AfterPosition(length - int(self)) - -# class AfterPosition(int, AbstractPosition): -class AfterPosition: - """ - Specify a position where the actual location is found after it. - """ - position: int - extension: int - - # Subclasses int so can't use __init__ - def __init__(cls: AfterPosition, position: int, extension: int=0): - """ - Create a new instance of the AfterPosition object. - """ - if extension != 0: - raise AttributeError(f"Non-zero extension {extension} for exact position.") - return int.__init__(cls, position) - - @property - def position(self: AfterPosition): - """ - Legacy attribute to get position as integer (OBSOLETE). - """ - return int(self) - - @property - def extension(self: AfterPosition): # noqa: D402 - """ - Legacy attribute to get extension (zero) as integer (OBSOLETE). - """ - return 0 - - def __repr__(self: AfterPosition): - """ - Represent the location as a string for debugging. - """ - return f"{self.__class__.__name__}({int(self)})" - - def __str__(self: AfterPosition): - """ - Return a representation of the AfterPosition object (with python counting). - """ - return f">{self.position}" - - def _shift(self: AfterPosition, offset: int): - """ - Return a copy of the position object with its location shifted (PRIVATE). - """ - return self.__class__(int(self) + offset) - - def _flip(self: AfterPosition, length: int): - """ - Return a copy of the location after the parent is reversed (PRIVATE). - """ - return BeforePosition(length - int(self)) #class OneOfPosition(int, AbstractPosition) class OneOfPosition: @@ -1220,7 +1232,7 @@ class OneOfPosition: obj.position_choices = choices return obj - def __getnewargs__(self: OneOfPosition): + def __getnewargs__(self: OneOfPosition) -> tuple[int, list[ExactPosition]]: """ Return the arguments accepted by __init__. Necessary to allow pickling and unpickling of class instances. diff --git a/stdlib/biopy/seqrecord.seq b/stdlib/biopy/seqrecord.seq index 0bc2cd6e9..5fc23a78f 100644 --- a/stdlib/biopy/seqrecord.seq +++ b/stdlib/biopy/seqrecord.seq @@ -1,6 +1,8 @@ """Represent a Sequence Record, a sequence with annotation.""" +from bio.seqfeature import SeqFeature + _NO_SEQRECORD_COMPARISON = "SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest." @@ -24,6 +26,7 @@ class _RestrictedDict: for key, value in new_dict.items(): self[key] = value + class SeqRecord: """ A SeqRecord object holds a sequence and information about it. @@ -207,7 +210,7 @@ class SeqRecord: lines.append("Per letter annotation for: " + ", ".join(self.letter_annotations)) # Don't want to include the entire sequence, # and showing the alphabet is useful: - lines.append(repr(self.seq)) + lines.append(self.repr(self.seq)) return "\n".join(lines) def __repr__(self: SeqRecord): @@ -434,6 +437,9 @@ class SeqRecord: letter_annotations: dict[int, str]=False, dbxrefs: list[str]=False): """ Return new SeqRecord with reverse complement sequence. + TODO/CAVEATS: + @jordan- seq does not support lamda (check line 471) + answer.features.sort(key=lambda x: x.location.start.position) """ answer = SeqRecord(self.seq.reverse_complement()) if id: @@ -462,8 +468,9 @@ class SeqRecord: # Copy the old features, adjusting location and string length = len(answer) answer.features = [f._flip(length) for f in self.features] - answer.features.sort(key=lambda x: x.location.start.position) + # answer.features.sort(key=lambda x: x.location.start.position) + answer.features.sort(key= self.location.start.position) if annotations: answer.annotations = annotations else: From 5aae89dcca3bbef1dbfdddc9e35488e90f969c86 Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Fri, 20 Dec 2019 10:06:59 -0800 Subject: [PATCH 15/16] Testing and debugging seqfeature.seq --- stdlib/bio/seqfeature.seq | 206 +++++---------------- test/stdlib/biopy_test/seqfeature_test.seq | 94 ++++++++++ 2 files changed, 145 insertions(+), 155 deletions(-) create mode 100644 test/stdlib/biopy_test/seqfeature_test.seq diff --git a/stdlib/bio/seqfeature.seq b/stdlib/bio/seqfeature.seq index 95098aba3..cd7d6e7b5 100644 --- a/stdlib/bio/seqfeature.seq +++ b/stdlib/bio/seqfeature.seq @@ -4,113 +4,72 @@ Represent a Sequence Feature holding info about a part of a sequence. TODO/CAVEATS: @jordan- For now MutableSeq is left out """ +EXACT = 0 +BEFORE = 1 +AFTER = 2 -# class BeforePosition(int, AbstractPosition): -type BeforePosition(position: int): +type Position(position: int, mode: int, extension: int): """ - Specify a position where the actual location occurs before it. + Specify the specific position of a boundary. """ - # Subclasses int so can't use __init__ - def __init__(cls: BeforePosition, position: int, extension: int=0): - """ - Create a new instance in BeforePosition object. - """ + def __init__(cls: Position, position: int, mode: int, extension: int=0): if extension != 0: raise AttributeError(f"Non-zero extension {extension} for exact position.") - - return cls.position - - @property - def position(self: BeforePosition): - """ - Legacy attribute to get position as integer (OBSOLETE). - """ - return int(self.position) - - @property - def extension(self: BeforePosition): # noqa: D402 - """ - Legacy attribute to get extension (zero) as integer (OBSOLETE). - """ - return 0 - - def __repr__(self: BeforePosition): - """ - Represent the location as a string for debugging. - """ - return f"{self.__class__.__name__}({int(self)})" - - def __str__(self: BeforePosition): - """ - Return a representation of the BeforePosition object (with python counting). - """ - return "<{self.position}" - - def _shift(self: BeforePosition, offset: int): - """ - Return a copy of the position object with its location shifted (PRIVATE). - """ - return self.__class__(int(self) + offset) - def _flip(self: BeforePosition, length: int): - """ - Return a copy of the location after the parent is reversed (PRIVATE). - """ - return self.AfterPosition(length - int(self)) + if not 0 <= mode <= 2: + raise ValueError(f"mode must be a value of EXACT={EXACT}, BEFORE={BEFORE}, AFTER={AFTER}.") -# class AfterPosition(int, AbstractPosition): -type AfterPosition(position: int): - """ - Specify a position where the actual location is found after it. - """ - - # Subclasses int so can't use __init__ - def __init__(cls: AfterPosition, position: int, extension: int=0): - """ - Create a new instance of the AfterPosition object. - """ - if extension != 0: - raise AttributeError(f"Non-zero extension {extension} for exact position.") return cls.position - + @property - def position(self: AfterPosition): + def position(self: Position): """ Legacy attribute to get position as integer (OBSOLETE). """ - return int(self) - + return int(self.position) + @property - def extension(self: AfterPosition): # noqa: D402 + def extension(self: Position): # noqa: D402 """ Legacy attribute to get extension (zero) as integer (OBSOLETE). """ return 0 - def __repr__(self: AfterPosition): + def __repr__(self: Position): """ Represent the location as a string for debugging. """ - return f"{self.__class__.__name__}({int(self)})" + return f"{self.__class__.__name__}({int(self)})" - def __str__(self: AfterPosition): + def __str__(self: Position): """ - Return a representation of the AfterPosition object (with python counting). + Return a representation of the Position object. """ - return f">{self.position}" + if self.mode == EXACT: + return f"{self.position}" + elif self.mode == BEFORE: + return "<{self.position}" + else: + return ">{self.position}" - def _shift(self: AfterPosition, offset: int): + def _shift(self: Position, offset: int): """ Return a copy of the position object with its location shifted (PRIVATE). """ return self.__class__(int(self) + offset) - def _flip(self: AfterPosition, length: int): + def _flip(self: Position, length: int): """ Return a copy of the location after the parent is reversed (PRIVATE). """ - return BeforePosition(length - int(self)) + if self.mode == EXACT: + # By default perserve any subclass + return self.__class__(length = int(self)) + elif self.mode == BEFORE: + return Position(length - int(self), 2, 0) + else: + return Position(length - int(self), 1, 0) # --- Handling feature locations @@ -130,25 +89,23 @@ class FeatureLocation: """ Initialize the class. """ - # TODO - Check 0 <= start <= end (<= length of reference) + self._start = start - self._end = end - if self.start > self.end: - raise ValueError(f"""End location ({self.end}) must be greater than or equal - to start location ({self.start})""") + if self._start > self._end: + raise ValueError(f"End location {self._end} must be greater than or equal to start location {self._start}.") + self.strand = strand self.ref = ref self.ref_db = ref_db - - def __init__(self: FeatureLocation, start: BeforePosition, end: AfterPosition, strand: optional[int]=None, ref: optional[str]=None, ref_db: optional[str]=None): + + def __init__(self: FeatureLocation, start: Position, end: Position, strand: optional[int]=None, ref: optional[str]=None, ref_db: optional[str]=None): """ Initialize the class. """ - # TODO - Check 0 <= start <= end (<= length of reference) + self._start = start - self._end = end if self.start > self.end: @@ -212,12 +169,6 @@ class FeatureLocation: optional += f", ref_db={self.ref_db}" return f"{self.__class__.__name__}({self.start}, {self.end}{optional})" \ - def __add__(self: FeatureLocation, other: FeatureLocation): - """ - Combine location with another FeatureLocation object, or shift it. - """ - return self.CompoundLocation([self, other]) - # Not sure about `other` type def __radd__(self: FeatureLocation, other: int): """ @@ -721,7 +672,7 @@ class CompoundLocation: """ Return a representation of the CompoundLocation object. """ - return f"""{self.operator}, {(", ".join(str(loc) for loc in self.parts))}""" + return f"""{self.operator}{{(", ".join(list(str(loc) for loc in self.parts)))}} """ def __repr__(self: CompoundLocation): """ @@ -943,6 +894,13 @@ class CompoundLocation: f_seq += part return f_seq +extend FeatureLocation: + def __add__(self: FeatureLocation, other: FeatureLocation): + """ + Combine location with another FeatureLocation object, or shift it. + """ + return CompoundLocation([self, other], "join") + # class AbstractPosition(object): class AbstractPosition: """ @@ -955,69 +913,7 @@ class AbstractPosition: """ return f"{self.__class__.__name__}(...)" -# class ExactPosition(int, AbstractPosition): -class ExactPosition: - """ - Specify the specific position of a boundary. - - position - The position of the boundary. - - extension - An optional argument which must be zero since we don't - have an extension. The argument is provided so that the same number - of arguments can be passed to all position types. - """ - position: int - extension: int - - def __init__(cls: ExactPosition, position: int, extension: int=0): - """ - Create an ExactPosition object. - """ - if extension != 0: - raise AttributeError(f"Non-zero extension {extension} for exact position.") - - return int.__init__(cls, position) - - def __str__(self: ExactPosition) -> str: - """ - Return a representation of the ExactPosition object. - """ - return str(int(self)) - - def __repr__(self: ExactPosition): - """ - Represent the ExactPosition object as a string for debugging. - """ - return f"{self.__class__.__name__}({int(self)})" - - @property - def position(self: ExactPosition): - """ - Legacy attribute to get position as integer (OBSOLETE). - """ - return int(self) - - @property - def extension(self: ExactPosition): - """ - Not present in this object, return zero (OBSOLETE). - """ - return 0 - - def _shift(self: ExactPosition, offset: int): - """ - Return a copy of the position object with its location shifted (PRIVATE). - """ - # By default preserve any subclass - return self.__class__(int(self) + offset) - - def _flip(self: ExactPosition, length: int): - """ - Return a copy of the location after the parent is reversed (PRIVATE). - """ - # By default perserve any subclass - return self.__class__(length - int(self)) - - -# class UncertainPosition(ExactPosition): +# class UncertainPosition(Position): # """ # Specify a specific position which is uncertain. # This is used in UniProt, e.g. ?222 for uncertain position 222, or in the @@ -1217,9 +1113,9 @@ class OneOfPosition: Specify a position where the location can be multiple positions. """ position: int - choices: list[ExactPosition] + choices: list[Position] - def __init__(cls: OneOfPosition, position: int, choices: list[ExactPosition]): + def __init__(cls: OneOfPosition, position: int, choices: list[Position]): """ Initialize with a set of possible positions. position_list is a list of AbstractPosition derived objects, @@ -1232,7 +1128,7 @@ class OneOfPosition: obj.position_choices = choices return obj - def __getnewargs__(self: OneOfPosition) -> tuple[int, list[ExactPosition]]: + def __getnewargs__(self: OneOfPosition) -> tuple[int, list[Position]]: """ Return the arguments accepted by __init__. Necessary to allow pickling and unpickling of class instances. diff --git a/test/stdlib/biopy_test/seqfeature_test.seq b/test/stdlib/biopy_test/seqfeature_test.seq new file mode 100644 index 000000000..46c7dee67 --- /dev/null +++ b/test/stdlib/biopy_test/seqfeature_test.seq @@ -0,0 +1,94 @@ +from Bio.SeqFeature import FeatureLocation, Position +from Bio.SeqFeature import CompoundLocation, UnknownPosition, SeqFeature +from Bio.SeqFeature import WithinPosition, BetweenPosition +from Bio.SeqFeature import OneOfPosition + + +####### Test FeatureLocation ####### +@test +def test_eq_identical(): + """ + Test two identical locations are equal. + """ + loc1 = FeatureLocation(23, 42, 1, '', '') + loc2 = FeatureLocation(23, 42, 1, '', '') + assert loc1 == loc2 + + loc1 = FeatureLocation(23, 42, -1, '', '') + loc2 = FeatureLocation(23, 42, -1, '', '') + assert loc1 == loc2 + + loc1 = FeatureLocation(23, 42, -1, '', '') + loc2 = FeatureLocation(23, 42, -1, '', '') + assert loc1 == loc2 + + # loc1 = FeatureLocation(Position(23, 1, 0), Position(42, 2, 0), 1, '', '') + # loc2 = FeatureLocation(23, 42, 1, '', '') + # print (loc1, loc2) + + loc1 = FeatureLocation(23, 42, 1, "foo", "bar") + loc2 = FeatureLocation(23, 42, 1, "foo", "bar") + assert loc1 == loc2 +test_eq_identical() + +@test +def test_eq_not_identical(): + """ + Test two different locations are not equal. + """ + loc1 = FeatureLocation(22, 42, 1, '', '') + loc2 = FeatureLocation(23, 42, 1, '', '') + assert loc1 != loc2 + + loc1 = FeatureLocation(23, 42, 1, '', '') + loc2 = FeatureLocation(23, 43, 1, '', '') + assert loc1 != loc2 + + loc1 = FeatureLocation(23, 42, 1, '', '') + loc2 = FeatureLocation(23, 42, -1, '', '') + assert loc1 != loc2 + + loc1 = FeatureLocation(23, 42, 1, 'foo', '') + loc2 = FeatureLocation(23, 42, 1, 'bar', '') + assert loc1 != loc2 + + loc1 = FeatureLocation(23, 42, 1, 'foo', 'bar') + loc2 = FeatureLocation(23, 42, 1, 'foo', 'baz') + assert loc1 != loc2 +test_eq_not_identical() + +@test +# not finished +def test_start_before_end(): + expected = "must be greater than or equal to start location" + print FeatureLocation(42, 23, 1, '', '') + +test_start_before_end() + +####### Test CompoundLocation ####### + +@test +def test_eq_identical1(): + loc1 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + loc2 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + assert loc1 == loc2 + + loc1 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + loc2 = CompoundLocation([FeatureLocation(12, 17, 1, '', ''), FeatureLocation(23, 42, 1, '', '')], 'join') + assert loc1 == loc2 +test_eq_identical1() + +@test +def test_eq_not_identical1(): + loc1 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + loc2 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + FeatureLocation(50, 60, 1, '', '') + assert loc1 != loc2 + + loc1 = FeatureLocation(12, 17, 1, '', '') + FeatureLocation(23, 42, 1, '', '') + loc2 = FeatureLocation(12, 17, -1, '', '') + FeatureLocation(23, 42, -1, '', '') + assert loc1 != loc2 + + loc1 = CompoundLocation([FeatureLocation(12, 17, 1, '', ''), FeatureLocation(23, 42, 1, '', '')], "join") + loc2 = CompoundLocation([FeatureLocation(12, 17, 1, '', ''), FeatureLocation(23, 42, 1, '', '')], 'order') + assert loc1 != loc2 +test_eq_not_identical1() From f48a7bfed582e6a5dd4e6cc1a52e3dd4c4242e8f Mon Sep 17 00:00:00 2001 From: Jordan Watson Date: Tue, 24 Dec 2019 12:34:28 -0800 Subject: [PATCH 16/16] Added more testing and debugging --- stdlib/bio/seqfeature.seq | 410 +++++++++++---------- test/stdlib/biopy_test/seqfeature_test.seq | 38 +- 2 files changed, 244 insertions(+), 204 deletions(-) diff --git a/stdlib/bio/seqfeature.seq b/stdlib/bio/seqfeature.seq index cd7d6e7b5..bb3fe186d 100644 --- a/stdlib/bio/seqfeature.seq +++ b/stdlib/bio/seqfeature.seq @@ -8,10 +8,13 @@ EXACT = 0 BEFORE = 1 AFTER = 2 -type Position(position: int, mode: int, extension: int): +class Position: """ Specify the specific position of a boundary. """ + pos: int + mode: int + extension: int def __init__(cls: Position, position: int, mode: int, extension: int=0): if extension != 0: @@ -20,14 +23,15 @@ type Position(position: int, mode: int, extension: int): if not 0 <= mode <= 2: raise ValueError(f"mode must be a value of EXACT={EXACT}, BEFORE={BEFORE}, AFTER={AFTER}.") - return cls.position + cls.pos = position + cls.mode = mode @property def position(self: Position): """ Legacy attribute to get position as integer (OBSOLETE). """ - return int(self.position) + return self.pos @property def extension(self: Position): # noqa: D402 @@ -40,24 +44,24 @@ type Position(position: int, mode: int, extension: int): """ Represent the location as a string for debugging. """ - return f"{self.__class__.__name__}({int(self)})" + return f"{self.__class__.__name__}({self.pos})" def __str__(self: Position): """ Return a representation of the Position object. """ if self.mode == EXACT: - return f"{self.position}" + return f"{self.pos}" elif self.mode == BEFORE: - return "<{self.position}" + return f"<{self.pos}" else: - return ">{self.position}" + return f">{self.pos}" def _shift(self: Position, offset: int): """ Return a copy of the position object with its location shifted (PRIVATE). """ - return self.__class__(int(self) + offset) + return self.__class__(self.pos + offset) def _flip(self: Position, length: int): """ @@ -65,12 +69,83 @@ type Position(position: int, mode: int, extension: int): """ if self.mode == EXACT: # By default perserve any subclass - return self.__class__(length = int(self)) + return self.__class__(length = self.pos) elif self.mode == BEFORE: - return Position(length - int(self), 2, 0) + return Position(length - self.pos, 2, 0) else: - return Position(length - int(self), 1, 0) + return Position(length - self.pos, 1, 0) +# class WithinPosition(int, AbstractPosition): +class WithinPosition: + """ + Specify the position of a boundary within some coordinates. + """ + pos: int + left: int + right: int + + def __init__(cls: WithinPosition, position: int, left: int, right: int): + """ + Create a WithinPosition object. + """ + if not (position == left or position == right): + raise RuntimeError(f"WithinPosition: {position} should match left {left} or right {right}") + cls.pos = position + cls.left = left + cls.right = right + + def __getnewargs__(self: WithinPosition): + """ + Return the arguments accepted by __init__. + Necessary to allow pickling and unpickling of class instances. + """ + return (int(self), self._left, self._right) + + def __repr__(self: WithinPosition): + """ + Represent the WithinPosition object as a string for debugging. + """ + return f"{self.__class__.__name__}({int(self)}, left={self._left}, right={self._right})" \ + + def __str__(self: WithinPosition): + """ + Return a representation of the WithinPosition object. + """ + return f"({self.left}.{self.right})" + + @property + def position(self: WithinPosition): + """ + Legacy attribute to get (left) position as integer (OBSOLETE). + """ + return self.pos + + @property + def extension(self: WithinPosition): # noqa: D402 + """ + Legacy attribute to get extension (from left to right) as an integer (OBSOLETE). + """ + return self.right - self.left + + def _shift(self: WithinPosition, offset: int): + """ + Return a copy of the position object with its location shifted (PRIVATE). + """ + return self.__class__(int(self) + offset, + self.left + offset, + self.right + offset) + + def _flip(self: WithinPosition, length: int): + """ + Return a copy of the location after the parent is reversed (PRIVATE). + """ + return self.__class__(length - int(self), + length - self.right, + length - self.left) + +extend int: + def __init__(self: int, w: WithinPosition): + return w.pos # --- Handling feature locations @@ -96,24 +171,39 @@ class FeatureLocation: if self._start > self._end: raise ValueError(f"End location {self._end} must be greater than or equal to start location {self._start}.") - self.strand = strand - self.ref = ref - self.ref_db = ref_db + self._strand = strand + self._ref = ref + self._ref_db = ref_db - def __init__(self: FeatureLocation, start: Position, end: Position, strand: optional[int]=None, ref: optional[str]=None, ref_db: optional[str]=None): + def __init__(self: FeatureLocation, start: Position, end: Position, strand: int, ref: str, ref_db: str): """ Initialize the class. """ - self._start = start - self._end = end + self._start = start.pos + self._end = end.pos - if self.start > self.end: - raise ValueError(f"""End location ({self.end}) must be greater than or equal - to start location ({self.start})""") - self.strand = strand - self.ref = ref - self.ref_db = ref_db + if self._start > self._end: + raise ValueError(f"""End location ({self._end}) must be greater than or equal + to start location ({self._start})""") + self._strand = strand + self._ref = ref + self._ref_db = ref_db + + # def __init__(self: FeatureLocation, start: Position, end: WithinPosition, strand: int, ref: str, ref_db: str): + # """ + # Initialize the class. + # """ + + # self._start = start.pos + # self._end = end.pos + + # if self._start > self._end: + # raise ValueError(f"""End location ({self._end}) must be greater than or equal + # to start location ({self._start})""") + # self._strand = strand + # self._ref = ref + # self._ref_db = ref_db def _get_strand(self: FeatureLocation) -> int: """ @@ -141,7 +231,7 @@ class FeatureLocation: """ Return a representation of the FeatureLocation object. """ - answer = f"[{self.start}:{self.end}]" + answer = f"[{self._start}:{self._end}]" if self._ref and self._ref_db: answer = f"{self._ref_db}:{self._ref}{answer}" elif self._ref: @@ -161,13 +251,13 @@ class FeatureLocation: Represent the FeatureLocation object as a string for debugging. """ optional = "" - if self.strand is not None: - optional += F", strand={self.strand}" + if self._strand is not None: + optional += F", strand={self._strand}" if self.ref is not None: - optional += f", ref={self.ref}" + optional += f", ref={self._ref}" if self.ref_db is not None: - optional += f", ref_db={self.ref_db}" - return f"{self.__class__.__name__}({self.start}, {self.end}{optional})" \ + optional += f", ref_db={self._ref_db}" + return f"{self.__class__.__name__}({self._start}, {self._end}{optional})" \ # Not sure about `other` type def __radd__(self: FeatureLocation, other: int): @@ -213,8 +303,8 @@ class FeatureLocation: """ Implement equality by comparing all the location attributes. """ - return self._start == other.start and \ - self._end == other.end and \ + return self._start == other._start and \ + self._end == other._end and \ self._strand == other._strand and \ self._ref == other._ref and \ self._ref_db == other._ref_db @@ -229,27 +319,27 @@ class FeatureLocation: """ Return a copy of the FeatureLocation shifted by an offset (PRIVATE). """ - if self.ref or self.ref_db: + if self._ref or self._ref_db: raise ValueError("Feature references another sequence.") return FeatureLocation(start=self._start._shift(offset), end=self._end._shift(offset), - strand=self.strand) + strand=self._strand) def _flip(self: FeatureLocation, length): """ Return a copy of the location after the parent is reversed (PRIVATE). """ flip_strand = 0 - if self.ref or self.ref_db: + if self.ref or self._ref_db: raise ValueError("Feature references another sequence.") # Note this will flip the start and end too! - if self.strand == +1: + if self._strand == +1: flip_strand = -1 - elif self.strand == -1: + elif self._strand == -1: flip_strand = +1 else: # 0 or None - flip_strand = self.strand + flip_strand = self._strand return FeatureLocation(start=self._end._flip(length), end=self._start._flip(length), strand=flip_strand) @@ -280,14 +370,14 @@ class FeatureLocation: """ Start position (integer, approximated if fuzzy, read only) (OBSOLETE). """ - return int(self._start) + return self._start @property def nofuzzy_end(self: FeatureLocation): """ End position (integer, approximated if fuzzy, read only) (OBSOLETE). """ - return int(self._end) + return self._end # TODO/CAVEATS: # @jordan- reverse_complement needs to be implemented still @@ -913,15 +1003,6 @@ class AbstractPosition: """ return f"{self.__class__.__name__}(...)" -# class UncertainPosition(Position): -# """ -# Specify a specific position which is uncertain. -# This is used in UniProt, e.g. ?222 for uncertain position 222, or in the -# XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL. -# """ - -# pass - # class UnknownPosition(AbstractPosition): class UnknownPosition: """ @@ -968,82 +1049,12 @@ class UnknownPosition: """ return self -# class WithinPosition(int, AbstractPosition): -class WithinPosition: - """ - Specify the position of a boundary within some coordinates. - """ - position: int - left: int - right: int - - def __init__(cls: WithinPosition, position: int, left: int, right: int): - """ - Create a WithinPosition object. - """ - if not (position == left or position == right): - raise RuntimeError(f"WithinPosition: {position} should match left {left} or right {right}") - obj = int.__init__(cls, position) - obj._left = left - obj._right = right - return obj - - def __getnewargs__(self: WithinPosition): - """ - Return the arguments accepted by __init__. - Necessary to allow pickling and unpickling of class instances. - """ - return (int(self), self._left, self._right) - - def __repr__(self: WithinPosition): - """ - Represent the WithinPosition object as a string for debugging. - """ - return f"{self.__class__.__name__}({int(self)}, left={self._left}, right={self._right})" \ - - def __str__(self: WithinPosition): - """ - Return a representation of the WithinPosition object (with python counting). - """ - return f"({self._left}.{self._right})" - - @property - def position(self: WithinPosition): - """ - Legacy attribute to get (left) position as integer (OBSOLETE). - """ - return self._left - - @property - def extension(self: WithinPosition): # noqa: D402 - """ - Legacy attribute to get extension (from left to right) as an integer (OBSOLETE). - """ - return self._right - self._left - - def _shift(self: WithinPosition, offset: int): - """ - Return a copy of the position object with its location shifted (PRIVATE). - """ - return self.__class__(int(self) + offset, - self._left + offset, - self._right + offset) - - def _flip(self: WithinPosition, length: int): - """ - Return a copy of the location after the parent is reversed (PRIVATE). - """ - return self.__class__(length - int(self), - length - self._right, - length - self._left) - - # class BetweenPosition(int, AbstractPosition): class BetweenPosition: """ Specify the position of a boundary between two coordinates (OBSOLETE?). """ - position: int + pos: int left: int right: int @@ -1051,11 +1062,11 @@ class BetweenPosition: """ Create a new instance in BetweenPosition object. """ - assert position == left or position == right - obj = int.__init__(cls, position) - obj._left = left - obj._right = right - return obj + assert position == left or position == right + cls.pos = position + cls.left = left + cls.right = right + def __getnewargs__(self: BetweenPosition): """ @@ -1074,14 +1085,14 @@ class BetweenPosition: """ Return a representation of the BetweenPosition object (with python counting). """ - return f"({self._left}^{self._right})" + return f"({self.left}^{self.right})" @property def position(self: BetweenPosition): """ Legacy attribute to get (left) position as integer (OBSOLETE). """ - return self._left + return self.left @property def extension(self: BetweenPosition): # noqa: D402 @@ -1105,81 +1116,82 @@ class BetweenPosition: return self.__class__(length - int(self), length - self._right, length - self._left) - - -#class OneOfPosition(int, AbstractPosition) -class OneOfPosition: - """ - Specify a position where the location can be multiple positions. - """ - position: int - choices: list[Position] - - def __init__(cls: OneOfPosition, position: int, choices: list[Position]): - """ - Initialize with a set of possible positions. - position_list is a list of AbstractPosition derived objects, - specifying possible locations. - position is an integer specifying the default behaviour. - """ - if position not in choices: - raise ValueError(f"OneOfPosition: {position} should match one of {choices}") - obj = int.__init__(cls, position) - obj.position_choices = choices - return obj - - def __getnewargs__(self: OneOfPosition) -> tuple[int, list[Position]]: - """ - Return the arguments accepted by __init__. - Necessary to allow pickling and unpickling of class instances. - """ - return (int(self), self.position_choices) - - @property - def position(self: OneOfPosition): - """ - Legacy attribute to get (left) position as integer (OBSOLETE). - """ - return min(int(pos) for pos in self.position_choices) - - @property - def extension(self: OneOfPosition): - """ - Legacy attribute to get extension as integer (OBSOLETE). - """ - positions = [int(pos) for pos in self.position_choices] - return max(positions) - min(positions) - - def __repr__(self: OneOfPosition): - """ - Represent the OneOfPosition object as a string for debugging. - """ - return f"{self.__class__.__name__}({int(self)}, choices={self.position_choices})" +extend int: + def __init__(self: int, b: BetweenPosition): + return b.pos + +# TODO/CAVEATS +# @jordan- OneofPosition has not been tested yet. +# #class OneOfPosition(int, AbstractPosition) +# class OneOfPosition: +# """ +# Specify a position where the location can be multiple positions. +# """ +# position_choices: int +# choices: list[Position] + +# def __init__(cls: OneOfPosition, position: int, choices: list[Position]): +# """ +# Initialize with a set of possible positions. +# position_list is a list of AbstractPosition derived objects, +# specifying possible locations. +# position is an integer specifying the default behaviour. +# """ +# cls.position_choices = position +# cls.choices = choices + +# def __getnewargs__(self: OneOfPosition) -> tuple[OneOfPosition, list[Position]]: +# """ +# Return the arguments accepted by __init__. +# Necessary to allow pickling and unpickling of class instances. +# """ +# return (self, self.position_choices) + +# @property +# def position(self: OneOfPosition): +# """ +# Legacy attribute to get (left) position as integer (OBSOLETE). +# """ +# return min(int(pos) for pos in self.position_choices) + +# @property +# def extension(self: OneOfPosition): +# """ +# Legacy attribute to get extension as integer (OBSOLETE). +# """ +# positions = [int(pos) for pos in self.position_choices] +# return max(positions) - min(positions) + +# def __repr__(self: OneOfPosition): +# """ +# Represent the OneOfPosition object as a string for debugging. +# """ +# return f"{self.__class__.__name__}({int(self)}, choices={self.position_choices})" - def __str__(self: OneOfPosition): - """ - Return a representation of the OneOfPosition object. - """ - out = "one-of(" - for position in self.position_choices: - out += f"{position}," - # replace the last comma with the closing parenthesis - return out[:-1] + ")" - - def _shift(self: OneOfPosition, offset: int): - """ - Return a copy of the position object with its location shifted (PRIVATE). - """ - return self.__class__(int(self) + offset, - [p._shift(offset) for p in self.position_choices]) - - def _flip(self: OneOfPosition, length: int): - """ - Return a copy of the location after the parent is reversed (PRIVATE). - """ - return self.__class__(length - int(self), - [p._flip(length) for p in self.position_choices[::-1]]) +# def __str__(self: OneOfPosition): +# """ +# Return a representation of the OneOfPosition object. +# """ +# out = "one-of(" +# for position in self.position_choices: +# out += f"{position}," +# # replace the last comma with the closing parenthesis +# return out[:-1] + ")" + +# def _shift(self: OneOfPosition, offset: int): +# """ +# Return a copy of the position object with its location shifted (PRIVATE). +# """ +# return self.__class__(int(self) + offset, +# [p._shift(offset) for p in self.position_choices]) + +# def _flip(self: OneOfPosition, length: int): +# """ +# Return a copy of the location after the parent is reversed (PRIVATE). +# """ +# return self.__class__(length - int(self), +# [p._flip(length) for p in self.position_choices[::-1]]) #class PositionGap(object): class PositionGap: diff --git a/test/stdlib/biopy_test/seqfeature_test.seq b/test/stdlib/biopy_test/seqfeature_test.seq index 46c7dee67..440eeb8b2 100644 --- a/test/stdlib/biopy_test/seqfeature_test.seq +++ b/test/stdlib/biopy_test/seqfeature_test.seq @@ -1,7 +1,6 @@ from Bio.SeqFeature import FeatureLocation, Position from Bio.SeqFeature import CompoundLocation, UnknownPosition, SeqFeature from Bio.SeqFeature import WithinPosition, BetweenPosition -from Bio.SeqFeature import OneOfPosition ####### Test FeatureLocation ####### @@ -22,9 +21,9 @@ def test_eq_identical(): loc2 = FeatureLocation(23, 42, -1, '', '') assert loc1 == loc2 - # loc1 = FeatureLocation(Position(23, 1, 0), Position(42, 2, 0), 1, '', '') - # loc2 = FeatureLocation(23, 42, 1, '', '') - # print (loc1, loc2) + loc1 = FeatureLocation(Position(23, 1, 0), Position(42, 2, 0), 1, '', '') + loc2 = FeatureLocation(23, 42, 1, '', '') + assert loc1 == loc2 loc1 = FeatureLocation(23, 42, 1, "foo", "bar") loc2 = FeatureLocation(23, 42, 1, "foo", "bar") @@ -58,7 +57,7 @@ def test_eq_not_identical(): test_eq_not_identical() @test -# not finished +# not finished need to have raise error with start > end def test_start_before_end(): expected = "must be greater than or equal to start location" print FeatureLocation(42, 23, 1, '', '') @@ -92,3 +91,32 @@ def test_eq_not_identical1(): loc2 = CompoundLocation([FeatureLocation(12, 17, 1, '', ''), FeatureLocation(23, 42, 1, '', '')], 'order') assert loc1 != loc2 test_eq_not_identical1() + +####### Test Locations ####### + +@test +# Not finished +def test_fuzzy(): + exact_pos = Position(5, 0, 0) + within_pos_s = WithinPosition(10, left=10, right=13) + within_pos_e = WithinPosition(13, left=10, right=13) + between_pos_e = BetweenPosition(24, 20, 24) + before_pos = Position(15, 1, 0) + after_pos = Position(40, 2, 0) + + assert int(within_pos_s) == 10 + assert str(within_pos_s) == "(10.13)" + assert int(within_pos_e) == 13 + assert str(within_pos_e) == "(10.13)" + assert int(between_pos_e) == 24 + assert str(between_pos_e) == "(20^24)" + assert str(before_pos) == "<15" + assert str(after_pos) == ">40" + + # TODO/CAVEATS: + # FeatureLocation does not support these types for now. + # location1 = FeatureLocation(exact_pos, within_pos_e, 1, '', '') + # location2 = FeatureLocation(before_pos, between_pos_e) + # location3 = FeatureLocation(within_pos_s, after_pos) +test_fuzzy() +