jrid:0020
- GL Transmission Format: GLTF runtime 3D asset library (Generic)
+ GL Transmission Format: GLTF runtime 3D asset library schema (Generic)
False
False
False
True
- {'KEY': '$schema', 'STARTSWITH': 'http://json-schema.org/'}
-{'KEY': '$schema', 'ENDSWITH': '/schema#'}
+ {'KEY': '$schema', 'STARTSWITH': 'https://json-schema.org/'}
+{'KEY': '$schema', 'ENDSWITH': '/schema'}
{'KEY': 'title', 'EXISTS': None}
{'KEY': 'type', 'IS': 'object'}
{'KEY': 'description', 'IS': 'The root object for a glTF asset.'}
@@ -1510,7 +1510,7 @@ Contents
jrid:0017: Open Resume Document
jrid:0018: jacker song
jrid:0019: JSON Patch
-jrid:0020: GL Transmission Format: GLTF runtime 3D asset library (Generic)
+jrid:0020: GL Transmission Format: GLTF runtime 3D asset library schema (Generic)
jrid:0021: Tweet Object
jrid:0022: sandboxels save file
jrid:0023: dublin core metadata (archivematica)
diff --git a/docs/utils/index.html b/docs/utils/index.html
index ce33717..6236290 100644
--- a/docs/utils/index.html
+++ b/docs/utils/index.html
@@ -45,6 +45,11 @@
json2json will convert JSON compatible objects from one encoding
to UTF-8.
+src.utils.jsonid2pronom
+
+jsonid2pronom provides a helper script to enable export of generic
+JSONID compatible markers to a PRONOM compatible signature file.
+
@@ -67,6 +72,7 @@
diff --git a/docs/utils/json2json.html b/docs/utils/json2json.html
index d68fa96..62dce2d 100644
--- a/docs/utils/json2json.html
+++ b/docs/utils/json2json.html
@@ -78,7 +78,7 @@
async def identify_json(paths: list[str]):
"""Identify objects."""
for idx, path in enumerate(paths):
- valid, data, _, _ = await identify_plaintext_bytestream(path)
+ valid, data = await identify_plaintext_bytestream(path)
if not valid:
continue
print(json.dumps(data, indent=2))
@@ -143,7 +143,7 @@
parser = argparse.ArgumentParser(
prog="json2json",
description="parse JSON UTF-16 (BE-LE) objects and output them as UTF-8 for the sake of developer ergonomics",
- epilog="for more information visit https://github.com/ffdev-info/json-id",
+ epilog="for more information visit https://github.com/ffdev-info/jsonid",
)
parser.add_argument(
"--debug",
diff --git a/docs/utils/jsonid2pronom.html b/docs/utils/jsonid2pronom.html
new file mode 100644
index 0000000..09d9b27
--- /dev/null
+++ b/docs/utils/jsonid2pronom.html
@@ -0,0 +1,211 @@
+src.utils.jsonid2pronom API documentation
+
+Module src.utils.jsonid2pronom
+
+jsonid2pronom provides a helper script to enable export of generic
+JSONID compatible markers to a PRONOM compatible signature file.
+
+async def load_patterns (path: str) ‑> list
+
+async def load_patterns(path: str) -> list:
+ """Load patterns from a file for conversion to a signature file."""
+ patterns = []
+ with open(path, "r", encoding="utf-8") as patterns_file:
+ patterns = json.loads(patterns_file.read())
+ return patterns
+
+Load patterns from a file for conversion to a signature file.
+
+def main () ‑> None
+
+def main() -> None:
+ """Primary entry point for this script."""
+ parser = argparse.ArgumentParser(
+ prog="jsonid2pronom",
+ description="convert JSONID compatible markers to PRONOM",
+ epilog="for more information visit https://github.com/ffdev-info/jsonid",
+ )
+ parser.add_argument(
+ "--debug",
+ help="use debug loggng",
+ required=False,
+ action="store_true",
+ )
+ parser.add_argument(
+ "--path",
+ "-p",
+ help="file path to process",
+ required=False,
+ )
+ args = parser.parse_args()
+ logging.getLogger(__name__).setLevel(logging.DEBUG if args.debug else logging.INFO)
+ logger.debug("debug logging is configured")
+ if not args.path:
+ parser.print_help(sys.stderr)
+ sys.exit()
+ asyncio.run(
+ output_signature(
+ path=args.path,
+ )
+ )
+
+Primary entry point for this script.
+
+async def output_signature (path: str)
+
+async def output_signature(path: str):
+ """Output JSONID compatible signatures to PRONOM."""
+
+ formats = []
+
+ encodings = ("UTF-8", "UTF-16", "UTF-16BE", "UTF-32LE")
+ priorities = []
+
+ increment_id = 0
+
+ markers = await load_patterns(path)
+
+ if not markers:
+ logger.error("no patterns provided via path arg")
+ sys.exit(1)
+
+ for encoding in encodings:
+ increment_id += 1
+ json_puid = "jsonid2pronom/1"
+ name_ = f"JSONID2PRONOM Conversion ({encoding})"
+ try:
+ mime = "application/json"
+ except IndexError:
+ mime = ""
+ try:
+ sequences = pronom.process_markers(
+ copy.deepcopy(markers),
+ increment_id,
+ encoding=encoding,
+ )
+ except pronom.UnprocessableEntity as err:
+ logger.error(
+ "%s %s: cannot handle: %s",
+ json_puid,
+ name_,
+ err,
+ )
+ for err_marker in markers:
+ logger.debug("--- START ---")
+ logger.debug("marker: %s", err_marker)
+ logger.debug("--- END ---")
+ continue
+ fmt = pronom.Format(
+ id=increment_id,
+ name=name_,
+ version="",
+ puid=json_puid,
+ mime=mime,
+ classification="structured text",
+ external_signatures=[
+ pronom.ExternalSignature(
+ id=increment_id,
+ signature="json",
+ type=pronom.EXT,
+ )
+ ],
+ internal_signatures=sequences,
+ priorities=list(set(priorities)),
+ )
+ priorities.append(f"{increment_id}")
+ formats.append(fmt)
+
+ pronom.process_formats_to_stdout(formats)
+
+Output JSONID compatible signatures to PRONOM.
diff --git a/json2json.py b/json2json.py
new file mode 100644
index 0000000..1bcbe2b
--- /dev/null
+++ b/json2json.py
@@ -0,0 +1,12 @@
+"""JSONID entry-points."""
+
+from src.utils import json2json
+
+
+def main():
+ """Primary entry point for this script."""
+ json2json.main()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/json2pronom.py b/json2pronom.py
new file mode 100644
index 0000000..0f2c3f7
--- /dev/null
+++ b/json2pronom.py
@@ -0,0 +1,12 @@
+"""JSONID entry-points."""
+
+from src.utils import jsonid2pronom
+
+
+def main():
+ """Primary entry point for this script."""
+ jsonid2pronom.main()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/jsonid.py b/jsonid.py
index 5c087ae..597256b 100644
--- a/jsonid.py
+++ b/jsonid.py
@@ -1,4 +1,4 @@
-"""Donovan entry-points."""
+"""JSONID entry-points."""
from src.jsonid import jsonid
diff --git a/justfile b/justfile
index bfb4a32..9be2689 100644
--- a/justfile
+++ b/justfile
@@ -83,6 +83,10 @@ check-debug:
@hexdump file:
hexdump -v -e '"\\\x" 1/1 "%02x"' {{file}}
+# Hexdump bytes only.
+@hexdump-plain file:
+ hexdump -ve '1/1 "%.2x"' {{file}}
+
# code coverage
coverage:
python -m tox -e coverage
@@ -102,3 +106,25 @@ lookup-ref-ex:
# lookup ref
@lookup-ref ref:
python jsonid.py lookup {{ref}}
+
+# export as PRONOM
+pronom:
+ python jsonid.py --pronom --debug
+
+# move pronom to .droid folder
+pronom-to-droid:
+ ~/.droid6/clean.sh
+ rm -f ~/.droid6/signature_files/jsonid_pronom.xml
+ cp jsonid_pronom.xml ~/.droid6/signature_files/
+
+dir := `pwd`
+
+# load pronom to siegfried via roy
+roy:
+ @echo {{dir}}
+ roy build --noreports -extend {{dir}}/jsonid_pronom.xml
+
+# droid-cli
+droid-cli path: pronom-to-droid
+ java -jar ~/dp/droid/droid-binary-6.8.0-bin/droid-command-line-6.8.0.jar -s 1
+ java -jar ~/dp/droid/droid-binary-6.8.0-bin/droid-command-line-6.8.0.jar -ri {{path}}
diff --git a/pronom_example/patterns_example.json b/pronom_example/patterns_example.json
new file mode 100644
index 0000000..8631d38
--- /dev/null
+++ b/pronom_example/patterns_example.json
@@ -0,0 +1,4 @@
+[
+ {"KEY": "key1", "EXISTS": null},
+ {"KEY": "key2", "EXISTS": null}
+]
diff --git a/pyproject.toml b/pyproject.toml
index b7fe1b5..991bcdf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,7 @@ dependencies = {file = ["requirements/requirements.txt"]}
jsonid = "jsonid.jsonid:main"
momoa = "jsonid.jsonid:main"
json2json = "utils.json2json:main"
+json2pronom = "utils.jsonid2pronom:main"
[build-system]
requires = ["setuptools>=80.9.0", "wheel>=0.45.1", "setuptools_scm[toml]>=9.2.2"]
diff --git a/src/jsonid/export.py b/src/jsonid/export.py
index c062aa8..7164a0c 100644
--- a/src/jsonid/export.py
+++ b/src/jsonid/export.py
@@ -1,25 +1,35 @@
"""Functions to support export."""
+import copy
import datetime
import json
import logging
from datetime import timezone
+from typing import Final
try:
+ import pronom
import registry_data
import version
except ModuleNotFoundError:
try:
- from src.jsonid import registry_data, version
+ from src.jsonid import pronom, registry_data, version
except ModuleNotFoundError:
- from jsonid import registry_data, version
+ from jsonid import pronom, registry_data, version
logger = logging.getLogger(__name__)
+PRONOM_FILENAME: Final[str] = "jsonid_pronom.xml"
+
+
+class PRONOMException(Exception):
+ """Exception class if we can't create a PRONOM signature as expected."""
+
+
def exportJSON() -> None: # pylint: disable=C0103
"""Export to JSON."""
- logger.debug("exporting registry ad JSON")
+ logger.debug("exporting registry as JSON")
data = registry_data.registry()
json_obj = []
id_ = {
@@ -35,3 +45,102 @@ def exportJSON() -> None: # pylint: disable=C0103
for datum in data:
json_obj.append(datum.json())
print(json.dumps(json_obj, indent=2))
+
+
+def export_pronom() -> None:
+ """Export a PRONOM compatible set of signatures.
+
+ Export is done in two phases. First, a set of proposed "Baseline"
+ JSON signatures is created to catch generic JSON instances.
+
+ Second, the JSONID registry itself is exported.
+
+ Each exported format is given priority over those exported before
+ it so that there should be no multiple identification results.
+ """
+
+ # pylint: disable=R0914; too-many local variables.
+
+ logger.debug("exporting registry as PRONOM")
+
+ reg_data = registry_data.registry()
+ formats = []
+
+ encodings = ("UTF-8", "UTF-16", "UTF-16BE", "UTF-32LE")
+ priorities = []
+
+ increment_id = 0
+
+ for encoding in encodings:
+ all_baseline = pronom.create_baseline_json_sequences(encoding)
+ for baseline in all_baseline:
+ increment_id += 1
+ fmt = pronom.Format(
+ id=increment_id,
+ name=f"JSON (Baseline - fmt/817) ({encoding})",
+ version="",
+ puid="jsonid:0000",
+ mime="application/json",
+ classification="structured text",
+ external_signatures=[
+ pronom.ExternalSignature(
+ id=increment_id,
+ signature="json",
+ type=pronom.EXT,
+ )
+ ],
+ internal_signatures=[baseline],
+ priorities=priorities,
+ )
+ priorities.append(f"{increment_id}")
+ formats.append(fmt)
+
+ for encoding in encodings:
+ for entry in reg_data:
+ increment_id += 1
+ json_puid = f"{entry.json()['identifier']};{encoding}"
+ name_ = f"{entry.json()['name'][0]['@en']} ({encoding})"
+ markers = entry.json()["markers"]
+ try:
+ mime = entry.json()["mime"][0]
+ except IndexError:
+ mime = ""
+ try:
+ sequences = pronom.process_markers(
+ copy.deepcopy(markers),
+ increment_id,
+ encoding=encoding,
+ )
+ except pronom.UnprocessableEntity as err:
+ logger.error(
+ "%s %s: cannot handle: %s",
+ json_puid,
+ name_,
+ err,
+ )
+ for marker in markers:
+ logger.debug("--- START ---")
+ logger.debug("marker: %s", marker)
+ logger.debug("--- END ---")
+ continue
+ fmt = pronom.Format(
+ id=increment_id,
+ name=name_,
+ version="",
+ puid=json_puid,
+ mime=mime,
+ classification="structured text",
+ external_signatures=[
+ pronom.ExternalSignature(
+ id=increment_id,
+ signature="json",
+ type=pronom.EXT,
+ )
+ ],
+ internal_signatures=sequences,
+ priorities=copy.deepcopy(list(set(priorities))),
+ )
+ priorities.append(f"{increment_id}")
+ formats.append(fmt)
+
+ pronom.process_formats_and_save(formats, PRONOM_FILENAME)
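For illustration, the new `--pronom` flag (wired up in jsonid.py below) boils down to calling this export directly; a minimal sketch, assuming the package is importable from the repository root:

```python
# Illustrative only: invoke the PRONOM export directly.
from src.jsonid import export

# Writes the signature file to export.PRONOM_FILENAME ("jsonid_pronom.xml")
# in the current working directory.
export.export_pronom()
```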
diff --git a/src/jsonid/export_helpers.py b/src/jsonid/export_helpers.py
new file mode 100644
index 0000000..0dfec2f
--- /dev/null
+++ b/src/jsonid/export_helpers.py
@@ -0,0 +1,30 @@
+"""Helpers for the export functions."""
+
+import datetime
+from datetime import timezone
+from typing import Final
+from xml.dom.minidom import parseString
+
+UTC_TIME_FORMAT: Final[str] = "%Y-%m-%dT%H:%M:%SZ"
+
+
+def get_utc_timestamp_now():
+ """Get a formatted UTC timestamp for 'now' that can be used when
+ a timestamp is needed.
+ """
+ return datetime.datetime.now(timezone.utc).strftime(UTC_TIME_FORMAT)
+
+
+def new_prettify(c):
+ """Remove excess newlines from DOM output.
+
+ via: https://stackoverflow.com/a/14493981
+ """
+ reparsed = parseString(c)
+ return "\n".join(
+ [
+ line
+ for line in reparsed.toprettyxml(indent=" " * 2).split("\n")
+ if line.strip()
+ ]
+ )
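A small sketch of what `new_prettify()` does to minidom output (illustrative only, assuming the module is importable as shown):

```python
# Illustrative only: minidom's toprettyxml() output with blank lines removed.
from src.jsonid import export_helpers

print(export_helpers.new_prettify("<a><b>1</b></a>"))
# <?xml version="1.0" ?>
# <a>
#   <b>1</b>
# </a>
```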
diff --git a/src/jsonid/file_processing.py b/src/jsonid/file_processing.py
index 1d083a4..1da44ee 100644
--- a/src/jsonid/file_processing.py
+++ b/src/jsonid/file_processing.py
@@ -338,6 +338,27 @@ async def identify_plaintext_bytestream(
If analysis is `True` we try to return more low-level file
information to help folks make appraisal decisions.
+
+ Encodings in Python are split into the following, where UTF-32 on
+ its own is a little confusing. If we are writing, the plain UTF-32
+ codec prepends a byte-order-marker in native order, so we select
+ UTF-32LE to write an explicit byte order without a BOM.
+
+ If we are decoding it matters less: we simply try an encoding and
+ if it decodes successfully we use it.
+
+ Encodings:
+
+ "UTF-8",
+ "UTF-16",
+ "UTF-16LE",
+ "UTF-16BE",
+ "UTF-32",
+ "UTF-32LE",
+ "UTF-32BE",
+ "SHIFT-JIS",
+ "BIG5",
+
"""
# pylint: disable=R0911
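To illustrate the encoding note above: Python's plain UTF-32 codec writes a byte-order-mark (in native order) when encoding, while the explicit UTF-32LE variant does not. A sketch, not lines from the patch:

```python
# Illustrative only (results shown for a little-endian machine).
"{}".encode("UTF-32")    # b'\xff\xfe\x00\x00{\x00\x00\x00}\x00\x00\x00'
"{}".encode("UTF-32LE")  # b'{\x00\x00\x00}\x00\x00\x00'
```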
diff --git a/src/jsonid/helpers.py b/src/jsonid/helpers.py
index ca28bc1..3776fc1 100644
--- a/src/jsonid/helpers.py
+++ b/src/jsonid/helpers.py
@@ -2,7 +2,7 @@
import logging
import time
-from typing import Union
+from typing import Final, Union
try:
import htm_template
@@ -75,6 +75,15 @@ def format_marker(marker_text: str, marker: dict) -> str:
return f"{marker_text}{marker_formatted}\n"
+TYPE_BOOL: Final[str] = "bool"
+TYPE_FLOAT: Final[str] = "float"
+TYPE_INTEGER: Final[str] = "integer"
+TYPE_LIST: Final[str] = "list"
+TYPE_NONE: Final[str] = "NoneType"
+TYPE_MAP: Final[str] = "map"
+TYPE_STRING: Final[str] = "string"
+
+
def substitute_type_text(replace_me: Union[str, type]):
"""Output a text substitution for a type that will otherwise not
pretty-print.
@@ -83,19 +92,19 @@ def substitute_type_text(replace_me: Union[str, type]):
# pylint: disable=R0911
if replace_me.__name__ == "dict":
- return "map"
+ return TYPE_MAP
if replace_me.__name__ == "int":
- return "integer"
+ return TYPE_INTEGER
if replace_me.__name__ == "list":
- return "list"
+ return TYPE_LIST
if replace_me.__name__ == "str":
- return "string"
+ return TYPE_STRING
if replace_me.__name__ == "float":
- return "float"
+ return TYPE_FLOAT
if replace_me.__name__ == "bool":
- return "bool"
+ return TYPE_BOOL
if replace_me.__name__ == "NoneType":
- return "NoneType"
+ return TYPE_NONE
if not isinstance(replace_me, type):
pass
return replace_me
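For reference, the substitution behaves like this (illustrative calls only):

```python
# Illustrative only: type objects are mapped to their display strings.
from src.jsonid import helpers

helpers.substitute_type_text(dict)        # "map"
helpers.substitute_type_text(bool)        # "bool"
helpers.substitute_type_text(type(None))  # "NoneType"
```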
diff --git a/src/jsonid/jsonid.py b/src/jsonid/jsonid.py
index 0dfa6b8..7475e0d 100644
--- a/src/jsonid/jsonid.py
+++ b/src/jsonid/jsonid.py
@@ -98,7 +98,7 @@ def main() -> None:
# pylint: disable=R0912,R0915
parser = argparse.ArgumentParser(
- prog="json-id",
+ prog="jsonid",
description="proof-of-concept identifier for JSON objects on disk based on identifying valid objects and their key-values",
epilog="for more information visit https://github.com/ffdev-info/json-id",
)
@@ -156,10 +156,12 @@ def main() -> None:
help="path to a custom registry to lead into memory replacing the default",
required=False,
)
+ # NB. consider output to stdout once the feature is more stable.
parser.add_argument(
"--pronom",
- help="return a PRONOM-centric view of the results",
+ help=f"return a PRONOM-centric view of the results to `{export.PRONOM_FILENAME}` (BETA)",
required=False,
+ action="store_true",
)
parser.add_argument(
"--export",
@@ -220,7 +222,8 @@ def main() -> None:
if args.registry:
raise NotImplementedError("custom registry is not yet available")
if args.pronom:
- raise NotImplementedError("pronom view is not yet implemented")
+ export.export_pronom()
+ sys.exit()
if args.language:
raise NotImplementedError("multiple languages are not yet implemented")
if args.export:
diff --git a/src/jsonid/pronom.py b/src/jsonid/pronom.py
new file mode 100644
index 0000000..5a279df
--- /dev/null
+++ b/src/jsonid/pronom.py
@@ -0,0 +1,686 @@
+"""PRONOM export routines.
+
+XML tooling: https://xmllint.com/
+"""
+
+import binascii
+import codecs
+import logging
+import xml.dom.minidom
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Any, Final
+
+try:
+ import export_helpers
+ import helpers
+ import registry_matchers
+except ModuleNotFoundError:
+ try:
+ from src.jsonid import export_helpers, helpers, registry_matchers
+ except ModuleNotFoundError:
+ from jsonid import export_helpers, helpers, registry_matchers
+
+
+logger = logging.getLogger(__name__)
+
+
+DISK_SECTOR_SIZE: Final[int] = 4095
+
+# Common PRONOM characters.
+COLON: Final[str] = "3A"
+CURLY_OPEN: Final[str] = "7B"
+CURLY_CLOSE: Final[str] = "7D"
+SQUARE_OPEN: Final[str] = "5B"
+SQUARE_CLOSE: Final[str] = "5D"
+DOUBLE_QUOTE: Final[str] = "22"
+
+# Constant values.
+NUMBER_REGEX: Final[str] = "[30:39]"
+TRUE_VALUE: Final[str] = "74727565"
+FALSE_VALUE: Final[str] = "66616C7365"
+NULL_VALUE: Final[str] = "6E756C6C"
+
+# Our whitespace values could potentially be optimized per encoding,
+# e.g. to be more or less per encoding. 16 is a good default to enable
+# some UTF32-identification.
+WHITESPACE_REGEX: Final[str] = "{0-16}"
+
+# External signature types.
+EXT: Final[str] = "file extension"
+
+# Replacement markers for PRONOM pre-processing.
+MARKER_INDEX_START = "INDEX_START"
+MARKER_INDEX_END = "INDEX END"
+
+
+class UnprocessableEntity(Exception):
+ """Provide a way to give complete feedback to the caller to allow
+ it to exit."""
+
+
+@dataclass
+class ExternalSignature:
+ id: str
+ signature: str
+ type: str
+
+
+@dataclass
+class ByteSequence:
+ id: str
+ pos: str
+ min_off: str
+ max_off: str
+ endian: str
+ value: str
+
+
+@dataclass
+class InternalSignature:
+ id: str
+ name: str
+ byte_sequences: list[ByteSequence]
+
+
+@dataclass
+class Priority:
+ type: str
+ id: str
+
+
+@dataclass
+class Identifier:
+ type: str
+ value: str
+
+
+@dataclass
+class Format: # pylint: disable=R0902
+ id: str
+ name: str
+ version: str
+ puid: str
+ mime: str
+ classification: str
+ external_signatures: list[ExternalSignature]
+ internal_signatures: list[InternalSignature]
+ priorities: list[int]
+
+
+@lru_cache()
+def _get_bom() -> list:
+ """Generate a list of byte-order markers that allow us to replace
+ markers introduced through various encoding operations.
+ """
+ replaces = [
+ codecs.BOM,
+ codecs.BOM_BE,
+ codecs.BOM_LE,
+ codecs.BOM_UTF8,
+ codecs.BOM_UTF16,
+ codecs.BOM_UTF16_BE,
+ codecs.BOM_UTF16_LE,
+ codecs.BOM_UTF32,
+ codecs.BOM_UTF32_BE,
+ codecs.BOM_UTF32_LE,
+ ]
+ res = []
+ for bom in replaces:
+ hex_bom = ""
+ for marker in bom:
+ char = hex(marker)
+ hex_bom = f"{hex_bom}{char.replace('0x', '')}".upper()
+ res.append(hex_bom)
+ return res
+
+
+def create_many_to_one_byte_sequence(internal_signatures: list[InternalSignature]):
+ """Create a many to one byte sequence, i.e. a format with multiple
+ Internal Signatures.
+ """
+ internal_signature = ""
+ for internal in internal_signatures:
+ id_ = internal.id
+ bs = create_one_to_many_byte_sequence(internal.byte_sequences)
+ internal_signature = f"""
+{internal_signature}
+ {bs}
+
+ """
+ return internal_signature.strip()
+
+
+def calculate_variable_off_bof(item: ByteSequence):
+ """Given variable offsets, calculate the correct syntax."""
+ seq = item.value
+ if (
+ item.min_off != ""
+ and int(item.min_off) > 0
+ and item.max_off != ""
+ and int(item.max_off) > 0
+ ):
+ seq = f"{{{item.min_off}-{int(item.min_off)+int(item.max_off)}}}{seq}"
+ elif item.max_off != "" and int(item.max_off) > 0:
+ seq = f"{{0-{item.max_off}}}{seq}"
+ elif item.min_off != "" and int(item.min_off) > 0:
+ seq = f"{{{item.min_off}}}{seq}"
+ return seq
+
+
+def calculate_variable_off_eof(item: ByteSequence):
+ """Given variable offsets, calculate the correct syntax."""
+ seq = item.value
+ if (
+ item.min_off != ""
+ and int(item.min_off) > 0
+ and item.max_off != ""
+ and int(item.max_off) > 0
+ ):
+ seq = f"{seq}{{{item.min_off}-{int(item.min_off)+int(item.max_off)}}}"
+ elif item.max_off != "" and int(item.max_off) > 0:
+ seq = f"{seq}{{0-{item.max_off}}}"
+ elif item.min_off != "" and int(item.min_off) > 0:
+ seq = f"{seq}{{{item.min_off}}}"
+ return seq
+
+
+def create_one_to_many_byte_sequence(byte_sequences: list[ByteSequence]):
+ """Create a byte sequence object."""
+ byte_sequence = ""
+ for item in byte_sequences:
+ seq = item.value
+ if item.pos.startswith("EOF"):
+ seq = calculate_variable_off_eof(item)
+ elif item.pos.startswith("BOF"):
+ seq = calculate_variable_off_bof(item)
+ byte_sequence = f"""
+{byte_sequence.strip()}
+
+ """
+ return byte_sequence.strip()
+
+
+def create_file_format_collection(fmt: list[Format]):
+ """Create the FileFormatCollection object.
+
+ E.g.
+ ```
+
+ 1
+ ext
+
+
+
+ 880
+ 881
+ ai
+ 86
+ 331
+ 332
+ 771
+ 773
+
+ ```
+
+ """
+ internal_sigs = [
+ f"{sig.id} "
+ for sig in fmt.internal_signatures
+ ]
+ external_sigs = [
+ f"{sig.signature} "
+ for sig in fmt.external_signatures
+ if sig.type.lower() == EXT
+ ]
+ priorities = [
+ f"{priority} "
+ for priority in fmt.priorities
+ ]
+ ff = f"""
+
+ {"".join(internal_sigs).strip()}
+ {"".join(external_sigs).strip()}
+ {"".join(priorities).strip()}
+
+ """
+ return ff.strip()
+
+
+def _process_formats(formats: list[Format]):
+ """Process formats into a PRONOM XML file."""
+ isc = []
+ ffc = []
+ for fmt in formats:
+ ffc.append(create_file_format_collection(fmt))
+ if fmt.internal_signatures:
+ isc.append(create_many_to_one_byte_sequence(fmt.internal_signatures))
+ droid_template = f"""
+
+
+
+ {"".join(isc).strip()}
+
+
+ {"".join(ffc).strip()}
+
+
+ """
+ dom = None
+ signature_file = droid_template.strip().replace("\n", "")
+ try:
+ dom = xml.dom.minidom.parseString(signature_file)
+ except xml.parsers.expat.ExpatError as err:
+ logger.error("cannot process xml: %s", err)
+ return ""
+ pretty_xml = dom.toprettyxml(indent=" ", encoding="utf-8")
+ prettier_xml = export_helpers.new_prettify(pretty_xml)
+ return prettier_xml
+
+
+def process_formats_and_save(formats: list[Format], filename: str):
+ """Process the collected formats and output a signature file.
+
+ NB. Given our dataclasses here, we have the opportunity to rework
+ this data into many new structures. We output XML because DROID
+ expects XML.
+ """
+ prettier_xml = _process_formats(formats)
+ logger.info("outputting to: %s", filename)
+ with open(filename, "w", encoding="utf-8") as output_file:
+ output_file.write(prettier_xml)
+
+
+def process_formats_to_stdout(formats: list[Format]):
+ """Process the collected formats and output a signature file.
+
+ NB. Given our dataclasses here, we have the opportunity to rework
+ this data into many new structures. We output XML because DROID
+ expects XML.
+ """
+ prettier_xml = _process_formats(formats)
+ logger.info("outputting to: stdout")
+ print(prettier_xml)
+
+
+def encode_roundtrip(hexed_val: str, encoding: str) -> str:
+ """We want to get a plain-text byte-sequence into a new
+ encoding. It takes a few hops and skips.
+ """
+ val = hexed_val.strip()
+ try:
+ re_encoded = binascii.unhexlify(hexed_val).decode("utf-8").encode(encoding)
+ except (binascii.Error, UnicodeDecodeError) as err:
+ logger.error("cannot convert: %s len: %s ('%s')", hexed_val, len(val), err)
+ return val
+ hex_val = binascii.hexlify(re_encoded).decode().upper()
+ for bom in _get_bom():
+ if not hex_val.startswith(bom):
+ continue
+ return hex_val.replace(bom, "")
+ return hex_val
+
+
+def _type_to_str(type_: type, encoding: str) -> str:
+ """Given a data type marker we need to convert the type into a
+ byte sequence that will match the type.
+
+ E.g. BOOLEAN types evaluate to true or false encoded in ASCII.
+ E.g. STRING types need to begin and end with double-quotes but the
+ string itself is just a wildcard. The wildcard will match any
+ value between the double quotes.
+ """
+
+ curly_open_encoded = encode_roundtrip(CURLY_OPEN, encoding)
+ curly_close_encoded: Final[str] = encode_roundtrip(CURLY_CLOSE, encoding)
+ square_open_encoded: Final[str] = encode_roundtrip(SQUARE_OPEN, encoding)
+ square_close_encoded: Final[str] = encode_roundtrip(SQUARE_CLOSE, encoding)
+ double_quote_encoded: Final[str] = encode_roundtrip(DOUBLE_QUOTE, encoding)
+
+ try:
+ type_ = helpers.substitute_type_text(type_)
+ except AttributeError:
+ logger.debug("type_ already converted: %s", type_)
+
+ if type_ in (helpers.TYPE_INTEGER, helpers.TYPE_FLOAT):
+ # an integer field will begin 0-9 but it is unclear how to
+ # represent larger numbers? and whether we need to?
+ return NUMBER_REGEX
+ if type_ == helpers.TYPE_BOOL:
+ # true | false
+ return f"({encode_roundtrip(TRUE_VALUE, encoding)}|{encode_roundtrip(FALSE_VALUE, encoding)})"
+ if type_ == helpers.TYPE_STRING:
+ # string begins with a double quote and ends in a double quote.
+ return f"'{double_quote_encoded}*{double_quote_encoded}"
+ if type_ == helpers.TYPE_MAP:
+ # { == 7B; } == 7D
+ return f"{curly_open_encoded}*{curly_close_encoded}"
+ if type_ == helpers.TYPE_LIST:
+ # [ == 5B; ] == 5D
+ return f"{square_open_encoded}*{square_close_encoded}"
+ if type_ == helpers.TYPE_NONE:
+ # null
+ return f"{encode_roundtrip(NULL_VALUE, encoding)}".encode(encoding)
+ # This should only trigger for incorrect values at this point..
+ raise UnprocessableEntity(f"type_to_str: {type_}")
+
+
+def _complex_is_type(marker: Any) -> str:
+ """Complex IS might be another data structure, e.g. a dict, or
+ something else that we can't convert easily. It is simply a WIP
+ for now.
+ """
+ raise UnprocessableEntity(f"complex IS type: '{marker}' (WIP)")
+
+
+def _str_to_hex_str(string: str) -> str:
+ """Convert string to hexadecimal bytes.
+
+ We convert to bytes here first without encoding and then convert
+ the bytes to an encoding second. It should be possible to combine
+ those two procedures, but this has worked well during the
+ prototyping phase.
+ """
+ hex_bytes = []
+ for byte_ in string.encode():
+ hex_bytes.append(hex(byte_).replace("0x", ""))
+ hex_str = "".join(hex_bytes).upper()
+ return hex_str
+
+
+def quote_and_encode(value, encoding) -> str:
+ """Quote and encode a given value."""
+
+ double_quote_encoded: Final[str] = encode_roundtrip(DOUBLE_QUOTE, encoding)
+ # return f"{double_quote_encoded}{value}{double_quote_encoded}"
+ return f"{double_quote_encoded}{encode_roundtrip(value, encoding)}{double_quote_encoded}"
+
+
+def convert_marker_to_signature_sequence(marker: dict, encoding: str) -> str:
+ """Convert a JSONID marker into a signature sequence."""
+
+ # pylint: disable=R0914; too-many local variables.
+ # pylint: disable=R0911; too-many return statements.
+ # pylint: disable=R0915; too-many statements.
+
+ logger.debug("marker: %s", marker)
+
+ colon_encoded: Final[str] = encode_roundtrip(COLON, encoding)
+ double_quote_encoded: Final[str] = encode_roundtrip(DOUBLE_QUOTE, encoding)
+ curly_open_encoded: Final[str] = encode_roundtrip(CURLY_OPEN, encoding)
+ curly_close_encoded: Final[str] = encode_roundtrip(CURLY_CLOSE, encoding)
+ square_open_encoded: Final[str] = encode_roundtrip(SQUARE_OPEN, encoding)
+ square_close_encoded: Final[str] = encode_roundtrip(SQUARE_CLOSE, encoding)
+
+ instruction = ""
+ if registry_matchers.MARKER_GOTO in marker.keys():
+ # GOTO KEY and match KEY.
+ goto_key = _str_to_hex_str(marker["GOTO"])
+ key_at_goto = _str_to_hex_str(marker["KEY"])
+ goto_encoded = quote_and_encode(goto_key, encoding)
+ key_encoded = quote_and_encode(key_at_goto, encoding)
+ instruction = f"{goto_encoded}{WHITESPACE_REGEX}{colon_encoded}*{WHITESPACE_REGEX}{key_encoded}{WHITESPACE_REGEX}{colon_encoded}"
+ marker.pop("GOTO")
+ marker.pop("KEY")
+ return instruction.upper()
+ if registry_matchers.MARKER_INDEX in marker.keys():
+ key = _str_to_hex_str(marker["KEY"])
+ instruction = f"{WHITESPACE_REGEX}{square_open_encoded}*{curly_open_encoded}*{double_quote_encoded}{encode_roundtrip(key, encoding)}{double_quote_encoded}{WHITESPACE_REGEX}{colon_encoded}*{curly_close_encoded}*{square_close_encoded}"
+ marker.pop("INDEX")
+ marker.pop("KEY")
+ return instruction.upper()
+ if "KEY" in marker.keys():
+ key = _str_to_hex_str(marker["KEY"])
+ instruction = quote_and_encode(key, encoding)
+ marker.pop("KEY")
+ if registry_matchers.MARKER_KEY_EXISTS in marker.keys():
+ instruction = f"{instruction}{WHITESPACE_REGEX}{colon_encoded}".upper()
+ return instruction
+ if registry_matchers.MARKER_IS_TYPE in marker.keys():
+ is_type = _type_to_str(marker["ISTYPE"], encoding=encoding)
+ type_val = (
+ f"{instruction}{WHITESPACE_REGEX}{colon_encoded}{WHITESPACE_REGEX}{is_type}"
+ )
+ return type_val.upper()
+ if registry_matchers.MARKER_IS in marker.keys():
+ marker_is = marker["IS"]
+ if not isinstance(marker_is, str):
+ _complex_is_type(marker_is)
+ equals = _str_to_hex_str(marker_is)
+ is_val = f"{instruction}{WHITESPACE_REGEX}{encode_roundtrip(equals, encoding)}"
+ return is_val.upper()
+ if registry_matchers.MARKER_STARTSWITH in marker.keys():
+ starts_with = _str_to_hex_str(marker["STARTSWITH"])
+ starts_with_val = f"{instruction}{WHITESPACE_REGEX}{colon_encoded}{WHITESPACE_REGEX}{double_quote_encoded}{encode_roundtrip(starts_with, encoding)}"
+ return starts_with_val.upper()
+ if registry_matchers.MARKER_ENDSWITH in marker.keys():
+ ends_with = _str_to_hex_str(marker["ENDSWITH"])
+ ends_with_val = f"{instruction}{WHITESPACE_REGEX}{colon_encoded}{WHITESPACE_REGEX}*{encode_roundtrip(ends_with, encoding)}{double_quote_encoded}"
+ return ends_with_val.upper()
+ if registry_matchers.MARKER_CONTAINS in marker.keys():
+ contains = _str_to_hex_str(marker["CONTAINS"])
+ contains_val = f"{instruction}{WHITESPACE_REGEX}{colon_encoded}{WHITESPACE_REGEX}{double_quote_encoded}*{encode_roundtrip(contains, encoding)}*{double_quote_encoded}"
+ return contains_val.upper()
+ if registry_matchers.MARKER_REGEX in marker.keys():
+ raise UnprocessableEntity("REGEX not yet implemented")
+ if registry_matchers.MARKER_KEY_NO_EXIST in marker.keys():
+ raise UnprocessableEntity("KEY NO EXIST not yet implemented")
+ # We should never arrive here. In the future clean this up so we
+ # only return when we have information.
+ return ""
+
+
+def preprocess_goto_markers(markers: list) -> list:
+ """Preprocess markers to remove data that is otherwise duplicated
+ when converted to a PRONOM signature, e.g. GOTO."""
+
+ out = []
+ for marker in markers:
+ if registry_matchers.MARKER_GOTO not in marker:
+ out.append(marker)
+ continue
+ key = marker.pop("GOTO")
+ new_marker = {"KEY": key, "EXISTS": None}
+ if new_marker not in out:
+ out.append(new_marker)
+ out.append(marker)
+ return out
+
+
+def process_markers(
+ markers: list, sig_id: int, encoding: str = ""
+) -> list[InternalSignature]:
+ """Given a set of markers for a document type, process them into
+ a set of byte sequences and finally an internal signature sequence
+ that can be output as a PRONOM signature.
+
+ Returns a list containing the internal signature built from the
+ given markers.
+
+ dict_keys(['CONTAINS'])
+ dict_keys(['ENDSWITH'])
+ dict_keys(['IS']
+ dict_keys(['ISTYPE'])
+ dict_keys(['STARTSWITH'])
+
+ key(0-n):(0-n)value
+
+ Need to return something like:
+
+
+
+ Different encodings need to be accounted for, e.g. (with added
+ whitespace below)
+
+ UTF-32-LE:
+
+ 00000000: 2000 0000 2000 0000 2000 0000 2000 0000 ... ... ... ...
+ 00000010: 2000 0000 2000 0000 0a00 0000 0a00 0000 ... ...........
+ 00000020: 0a00 0000 0a00 0000 7b00 0000 2200 0000 ........{..."...
+ 00000030: 6100 0000 2200 0000 3a00 0000 2000 0000 a..."...:... ...
+ 00000040: 2200 0000 6200 0000 2200 0000 7d00 0000 "...b..."...}...
+ 00000050: 0a00 0000 ....
+
+ UTF-32-BE:
+
+ 00000000: 0000 0020 0000 0020 0000 0020 0000 0020 ... ... ... ...
+ 00000010: 0000 0020 0000 0020 0000 000a 0000 000a ... ... ........
+ 00000020: 0000 000a 0000 000a 0000 007b 0000 0022 ...........{..."
+ 00000030: 0000 0061 0000 0022 0000 003a 0000 0020 ...a..."...:...
+ 00000040: 0000 0022 0000 0062 0000 0022 0000 007d ..."...b..."...}
+ 00000050: 0000 000a ....
+
+
+ UTF-16-LE:
+
+ 00000000: 2000 2000 2000 2000 2000 2000 0a00 0a00 . . . . . .....
+ 00000010: 0a00 0a00 7b00 2200 6100 2200 3a00 2000 ....{.".a.".:. .
+ 00000020: 2200 6200 2200 7d00 0a00 ".b.".}...
+
+ UTF-16-BE:
+
+ 00000000: 0020 0020 0020 0020 0020 0020 000a 000a . . . . . . ....
+ 00000010: 000a 000a 007b 0022 0061 0022 003a 0020 .....{.".a.".:.
+ 00000020: 0022 0062 0022 007d 000a .".b.".}..
+
+
+ """
+
+ curly_open_encoded: Final[str] = encode_roundtrip(CURLY_OPEN, encoding)
+ curly_close_encoded: Final[str] = encode_roundtrip(CURLY_CLOSE, encoding)
+
+ sequences = []
+
+ markers = preprocess_goto_markers(markers)
+
+ for marker in markers:
+ sig_sequence = convert_marker_to_signature_sequence(marker, encoding)
+ sequences.append(sig_sequence)
+
+ byte_sequences = []
+
+ byte_sequences.append(
+ ByteSequence(
+ id=0,
+ pos="BOF",
+ min_off=0,
+ max_off=f"{DISK_SECTOR_SIZE}",
+ endian="",
+ value=curly_open_encoded,
+ )
+ )
+
+ for idx, item in enumerate(sequences, 0):
+ logger.debug("%s. %s", idx, item)
+ byte_sequence = ByteSequence(
+ id=idx,
+ pos="VAR",
+ min_off="",
+ max_off="",
+ endian="",
+ value=item,
+ )
+ byte_sequences.append(byte_sequence)
+
+ byte_sequences.append(
+ ByteSequence(
+ id=0,
+ pos="EOF",
+ min_off="0",
+ max_off=f"{DISK_SECTOR_SIZE}",
+ endian="",
+ value=curly_close_encoded,
+ )
+ )
+
+ internal_signature = InternalSignature(
+ id=sig_id,
+ name="",
+ byte_sequences=byte_sequences,
+ )
+
+ return [internal_signature]
+
+
+def create_baseline_json_sequences(encoding: str):
+ """Create baseline JSON sequences that match map and list types
+ with various different encodings.
+ """
+
+ # pylint: disable=R0914; too-many local variables.
+
+ curly_open_encoded: Final[str] = encode_roundtrip(CURLY_OPEN, encoding)
+ curly_close_encoded: Final[str] = encode_roundtrip(CURLY_CLOSE, encoding)
+ square_open_encoded: Final[str] = encode_roundtrip(SQUARE_OPEN, encoding)
+ square_close_encoded: Final[str] = encode_roundtrip(SQUARE_CLOSE, encoding)
+
+ colon_encoded: Final[str] = encode_roundtrip(COLON, encoding)
+ double_quote_encoded: Final[str] = encode_roundtrip(DOUBLE_QUOTE, encoding)
+
+ bof = f"({curly_open_encoded}|{square_open_encoded})"
+ eof = f"({curly_close_encoded}|{square_close_encoded})"
+
+ no_encoded: Final[str] = NUMBER_REGEX
+ true_encoded: Final[str] = encode_roundtrip(TRUE_VALUE, encoding)
+ false_encoded: Final[str] = encode_roundtrip(FALSE_VALUE, encoding)
+ null_encoded: Final[str] = encode_roundtrip(NULL_VALUE, encoding)
+
+ options = (
+ f"{double_quote_encoded}{WHITESPACE_REGEX}{colon_encoded}",
+ no_encoded,
+ f"({true_encoded}|{false_encoded})",
+ null_encoded,
+ )
+
+ sigs = []
+
+ for opt in options:
+
+ bs = []
+
+ bs.append(
+ ByteSequence(
+ id=1,
+ pos="BOF",
+ min_off=0,
+ max_off=f"{DISK_SECTOR_SIZE}",
+ endian="",
+ value=bof,
+ )
+ )
+
+ bs.append(
+ ByteSequence(
+ id=1,
+ pos="VAR",
+ min_off=0,
+ max_off=0,
+ endian="",
+ value=opt,
+ )
+ )
+
+ bs.append(
+ ByteSequence(
+ id=1,
+ pos="EOF",
+ min_off="0",
+ max_off=f"{DISK_SECTOR_SIZE}",
+ endian="",
+ value=eof,
+ )
+ )
+
+ iss = InternalSignature(
+ id=0,
+ name="",
+ byte_sequences=bs,
+ )
+
+ sigs.append(iss)
+
+ return sigs
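As a rough end-to-end illustration of the module (a sketch only; the expected sequence values mirror the SOPS case in the tests added below):

```python
# Illustrative only: build an internal signature from two JSONID markers.
from src.jsonid import pronom

markers = [
    {"KEY": "sops", "EXISTS": None},
    {"GOTO": "sops", "KEY": "kms", "EXISTS": None},
]
signatures = pronom.process_markers(markers, sig_id=1, encoding="utf-8")
for seq in signatures[0].byte_sequences:
    print(seq.pos, seq.value)
# BOF 7B
# VAR 22736F707322{0-16}3A
# VAR 226B6D7322{0-16}3A
# EOF 7D
```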
diff --git a/src/utils/json2json.py b/src/utils/json2json.py
index bdbbf21..c54b022 100644
--- a/src/utils/json2json.py
+++ b/src/utils/json2json.py
@@ -62,7 +62,7 @@ async def identify_plaintext_bytestream(path: str) -> Tuple[bool, str]:
async def identify_json(paths: list[str]):
"""Identify objects."""
for idx, path in enumerate(paths):
- valid, data, _, _ = await identify_plaintext_bytestream(path)
+ valid, data = await identify_plaintext_bytestream(path)
if not valid:
continue
print(json.dumps(data, indent=2))
@@ -114,7 +114,7 @@ def main() -> None:
parser = argparse.ArgumentParser(
prog="json2json",
description="parse JSON UTF-16 (BE-LE) objects and output them as UTF-8 for the sake of developer ergonomics",
- epilog="for more information visit https://github.com/ffdev-info/json-id",
+ epilog="for more information visit https://github.com/ffdev-info/jsonid",
)
parser.add_argument(
"--debug",
diff --git a/src/utils/jsonid2pronom.py b/src/utils/jsonid2pronom.py
new file mode 100644
index 0000000..444b641
--- /dev/null
+++ b/src/utils/jsonid2pronom.py
@@ -0,0 +1,139 @@
+"""jsonid2pronom provides a helper script to enable export of generic
+JSONID compatible markers to a PRONOM compatible signature file.
+"""
+
+import argparse
+import asyncio
+import copy
+import json
+import logging
+import sys
+
+try:
+ from src.jsonid import pronom
+except ModuleNotFoundError:
+ try:
+ from jsonid import pronom
+ except ModuleNotFoundError:
+ import pronom
+
+# Set up logging.
+logging.basicConfig(
+ format="%(asctime)-15s %(levelname)s :: %(filename)s:%(lineno)s:%(funcName)s() :: %(message)s", # noqa: E501
+ datefmt="%Y-%m-%d %H:%M:%S",
+ level="INFO",
+ handlers=[
+ logging.StreamHandler(),
+ ],
+)
+
+logger = logging.getLogger(__name__)
+
+
+async def load_patterns(path: str) -> list:
+ """Load patterns from a file for conversion to a signature file."""
+ patterns = []
+ with open(path, "r", encoding="utf-8") as patterns_file:
+ patterns = json.loads(patterns_file.read())
+ return patterns
+
+
+async def output_signature(path: str):
+ """Output JSONID compatible signatures to PRONOM."""
+
+ formats = []
+
+ encodings = ("UTF-8", "UTF-16", "UTF-16BE", "UTF-32LE")
+ priorities = []
+
+ increment_id = 0
+
+ markers = await load_patterns(path)
+
+ if not markers:
+ logger.error("no patterns provided via path arg")
+ sys.exit(1)
+
+ for encoding in encodings:
+ increment_id += 1
+ json_puid = f"jsonid2pronom/{increment_id}"
+ name_ = f"JSONID2PRONOM Conversion ({encoding})"
+ try:
+ mime = "application/json"
+ except IndexError:
+ mime = ""
+ try:
+ sequences = pronom.process_markers(
+ copy.deepcopy(markers),
+ increment_id,
+ encoding=encoding,
+ )
+ except pronom.UnprocessableEntity as err:
+ logger.error(
+ "%s %s: cannot handle: %s",
+ json_puid,
+ name_,
+ err,
+ )
+ for err_marker in markers:
+ logger.debug("--- START ---")
+ logger.debug("marker: %s", err_marker)
+ logger.debug("--- END ---")
+ continue
+ fmt = pronom.Format(
+ id=increment_id,
+ name=name_,
+ version="",
+ puid=json_puid,
+ mime=mime,
+ classification="structured text",
+ external_signatures=[
+ pronom.ExternalSignature(
+ id=increment_id,
+ signature="json",
+ type=pronom.EXT,
+ )
+ ],
+ internal_signatures=sequences,
+ priorities=list(set(priorities)),
+ )
+ priorities.append(f"{increment_id}")
+ formats.append(fmt)
+
+ pronom.process_formats_to_stdout(formats)
+
+
+def main() -> None:
+ """Primary entry point for this script."""
+ parser = argparse.ArgumentParser(
+ prog="jsonid2pronom",
+ description="convert JSONID compatible markers to PRONOM",
+ epilog="for more information visit https://github.com/ffdev-info/jsonid",
+ )
+ parser.add_argument(
+ "--debug",
+ help="use debug loggng",
+ required=False,
+ action="store_true",
+ )
+ parser.add_argument(
+ "--path",
+ "-p",
+ help="file path to process",
+ required=False,
+ )
+ args = parser.parse_args()
+ logging.getLogger(__name__).setLevel(logging.DEBUG if args.debug else logging.INFO)
+ logger.debug("debug logging is configured")
+ if not args.path:
+ parser.print_help(sys.stderr)
+ sys.exit()
+ asyncio.run(
+ output_signature(
+ path=args.path,
+ )
+ )
+
+
+if __name__ == "__main__":
+ main()
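The helper can also be driven from Python rather than the command line; a minimal sketch using the example markers added under pronom_example/:

```python
# Illustrative only: print a signature file for the example markers.
import asyncio

from src.utils import jsonid2pronom

asyncio.run(
    jsonid2pronom.output_signature(path="pronom_example/patterns_example.json")
)
```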
diff --git a/tests/test_pronom_export.py b/tests/test_pronom_export.py
new file mode 100644
index 0000000..7b6e9d5
--- /dev/null
+++ b/tests/test_pronom_export.py
@@ -0,0 +1,420 @@
+"""Test PRONOM export functions.
+
+NB. many of the tests here can be parametrized once we have good
+coverage. They've all been tested individually as the conversion
+scripts are ironed out. This will take on more relevance if the
+tool is used more for JSON signature creation in general.
+"""
+
+import pytest
+
+from src.jsonid import pronom
+
+encode_roundtrip_tests = [
+ (
+ "74657374",
+ "7400650073007400",
+ "utf-16",
+ )
+]
+
+
+@pytest.mark.parametrize("hex_sequences, expected, encoding", encode_roundtrip_tests)
+def test_encode_roundtrip(hex_sequences, expected, encoding):
+ """Re-encode a set of hexadecimal values to a new encoding."""
+
+ res = pronom.encode_roundtrip(hex_sequences, encoding)
+ assert res == expected
+
+
+preprocess_goto_tests = [
+ (
+ [
+ {"KEY": "sops", "EXISTS": None},
+ {"GOTO": "sops", "KEY": "kms", "EXISTS": None},
+ {"GOTO": "sops", "KEY": "pgp", "EXISTS": None},
+ ],
+ [
+ {"KEY": "sops", "EXISTS": None},
+ {"KEY": "kms", "EXISTS": None},
+ {"KEY": "pgp", "EXISTS": None},
+ ],
+ )
+]
+
+
+@pytest.mark.parametrize("markers_in, markers_out", preprocess_goto_tests)
+def test_preprocess_goto_markers(markers_in: list, markers_out: list):
+ """Make sure preprocess markers works as anticipated.
+
+ GOTO is largely synonymous with a "KEY" EXISTS check, so we can
+ remove duplicate GOTO entries and ensure just one "EXISTS" marker
+ for each GOTO key.
+ """
+
+ res = pronom.preprocess_goto_markers(markers_in)
+ assert res == markers_out
+
+
+preprocess_index_tests = [
+ (
+ [
+ {"INDEX": 0, "KEY": "Content-Length", "EXISTS": None},
+ {"INDEX": 0, "KEY": "Content-Type", "EXISTS": None},
+ {"INDEX": 0, "KEY": "X-TIKA:Parsed-By", "EXISTS": None},
+ {"INDEX": 0, "KEY": "X-TIKA:parse_time_millis", "EXISTS": None},
+ ],
+ [
+ {"INDEX_START": None},
+ {"KEY": "Content-Length", "EXISTS": None},
+ {"INDEX END": None},
+ {"INDEX_START": None},
+ {"KEY": "Content-Type", "EXISTS": None},
+ {"INDEX END": None},
+ {"INDEX_START": None},
+ {"KEY": "X-TIKA:Parsed-By", "EXISTS": None},
+ {"INDEX END": None},
+ {"INDEX_START": None},
+ {"KEY": "X-TIKA:parse_time_millis", "EXISTS": None},
+ {"INDEX END": None},
+ ],
+ )
+]
+
+
+def test_export_sops():
+ """SOPS is a good example of a more complex signature. Ensure
+ it works here.
+ """
+
+ markers = [
+ {"KEY": "sops", "EXISTS": None},
+ {"GOTO": "sops", "KEY": "kms", "EXISTS": None},
+ {"GOTO": "sops", "KEY": "pgp", "EXISTS": None},
+ ]
+
+ processed = pronom.process_markers(markers, 0, "utf-8")
+
+ res = []
+ for sequence in processed[0].byte_sequences:
+ res.append(sequence.value)
+
+ expected = [
+ "7B",
+ "22736F707322{0-16}3A",
+ "226B6D7322{0-16}3A",
+ "2270677022{0-16}3A",
+ "7D",
+ ]
+
+ assert res == expected
+
+
+def test_ocfl_inventory():
+ """OFCL inventoty is one of the first examples that worked out of
+ the box and provides good control for errors."""
+
+ markers = [
+ {"KEY": "type", "STARTSWITH": "https://ocfl.io/"},
+ {"KEY": "type", "CONTAINS": "spec/#inventory"},
+ {"KEY": "head", "EXISTS": None},
+ {"KEY": "manifest", "EXISTS": None},
+ ]
+
+ processed = pronom.process_markers(markers, 0, "utf-8")
+
+ res = []
+ for sequence in processed[0].byte_sequences:
+ res.append(sequence.value)
+
+ expected = [
+ "7B",
+ "227479706522{0-16}3A{0-16}2268747470733A2F2F6F63666C2E696F2F",
+ "227479706522{0-16}3A{0-16}22*737065632F23696E76656E746F7279*22",
+ "226865616422{0-16}3A",
+ "226D616E696665737422{0-16}3A",
+ "7D",
+ ]
+ assert res == expected
+
+
+def test_json_patch():
+ """Ensure that JSON patch style markers are converted
+ correctly.
+ """
+
+ markers = [
+ {"INDEX": 0, "KEY": "op", "EXISTS": None},
+ {"INDEX": 0, "KEY": "path", "EXISTS": None},
+ ]
+
+ processed = pronom.process_markers(markers, 0, "utf-8")
+
+ res = []
+ for sequence in processed[0].byte_sequences:
+ res.append(sequence.value)
+
+ expected = [
+ "7B",
+ "{0-16}5B*7B*226F7022{0-16}3A*7D*5D",
+ "{0-16}5B*7B*227061746822{0-16}3A*7D*5D",
+ "7D",
+ ]
+
+ assert res == expected
+
+
+def test_gltf_schema():
+ """Ensure that GLTF style patterns are converted correctly.
+
+ NB. the pattern here is potentially different from that in the
+ registry and the registry may need updating to be more
+ permissive, or simply corrected.
+ """
+
+ markers = [
+ {"KEY": "$schema", "STARTSWITH": "https://json-schema.org/"},
+ {"KEY": "$schema", "ENDSWITH": "/schema"},
+ {"KEY": "title", "EXISTS": None},
+ {"KEY": "type", "IS": "object"},
+ {"KEY": "description", "IS": "The root object for a glTF asset."},
+ ]
+
+ processed = pronom.process_markers(markers, 0, "utf-8")
+
+ res = []
+ for sequence in processed[0].byte_sequences:
+ res.append(sequence.value)
+
+ expected = [
+ "7B",
+ "2224736368656D6122{0-16}3A{0-16}2268747470733A2F2F6A736F6E2D736368656D612E6F72672F",
+ "2224736368656D6122{0-16}3A{0-16}*2F736368656D6122",
+ "227469746C6522{0-16}3A",
+ "227479706522{0-16}6F626A656374",
+ "226465736372697074696F6E22{0-16}54686520726F6F74206F626A65637420666F72206120676C54462061737365742E",
+ "7D",
+ ]
+
+ assert res == expected
+
+
+def test_tika_recursive():
+ """Ensure that TIKA style signatures (relying largely on INDEX) are
+ converted correctly.
+ """
+
+ markers = [
+ {"INDEX": 0, "KEY": "Content-Length", "EXISTS": None},
+ {"INDEX": 0, "KEY": "Content-Type", "EXISTS": None},
+ {"INDEX": 0, "KEY": "X-TIKA:Parsed-By", "EXISTS": None},
+ {"INDEX": 0, "KEY": "X-TIKA:parse_time_millis", "EXISTS": None},
+ ]
+
+ processed = pronom.process_markers(markers, 0, "utf-8")
+
+ res = []
+ for sequence in processed[0].byte_sequences:
+ res.append(sequence.value)
+
+ expected = [
+ "7B",
+ "{0-16}5B*7B*22436F6E74656E742D4C656E67746822{0-16}3A*7D*5D",
+ "{0-16}5B*7B*22436F6E74656E742D5479706522{0-16}3A*7D*5D",
+ "{0-16}5B*7B*22582D54494B413A5061727365642D427922{0-16}3A*7D*5D",
+ "{0-16}5B*7B*22582D54494B413A70617273655F74696D655F6D696C6C697322{0-16}3A*7D*5D",
+ "7D",
+ ]
+
+ assert res == expected
+
+
+encoding_tests = [
+ (
+ [
+ {"KEY": "test", "IS": "data"},
+ {"KEY": "file", "ISTYPE": int},
+ {"KEY": "bool", "ISTYPE": bool},
+ {"KEY": "here", "EXISTS": None},
+ {"KEY": "within", "CONTAINS": "value"},
+ {"KEY": "start", "STARTSWITH": "value"},
+ {"KEY": "end", "ENDSWITH": "value"},
+ {"GOTO": "key", "KEY": "at", "EXISTS": None},
+ ],
+ [
+ "7B",
+ "227465737422{0-16}64617461",
+ "2266696C6522{0-16}3A{0-16}[30:39]",
+ "22626F6F6C22{0-16}3A{0-16}(74727565|66616C7365)",
+ "226865726522{0-16}3A",
+ "2277697468696E22{0-16}3A{0-16}22*76616C7565*22",
+ "22737461727422{0-16}3A{0-16}2276616C7565",
+ "22656E6422{0-16}3A{0-16}*76616C756522",
+ "226B657922{0-16}3A",
+ "22617422{0-16}3A",
+ "7D",
+ ],
+ "utf-8",
+ ),
+ (
+ [
+ {"INDEX": 1, "KEY": "key", "EXISTS": None},
+ ],
+ [
+ "7B",
+ "{0-16}5B*7B*226B657922{0-16}3A*7D*5D",
+ "7D",
+ ],
+ "utf-8",
+ ),
+ (
+ [
+ {"KEY": "test", "IS": "data"},
+ {"KEY": "file", "ISTYPE": int},
+ {"KEY": "bool", "ISTYPE": bool},
+ {"KEY": "here", "EXISTS": None},
+ {"KEY": "within", "CONTAINS": "value"},
+ {"KEY": "start", "STARTSWITH": "value"},
+ {"KEY": "end", "ENDSWITH": "value"},
+ {"GOTO": "key", "KEY": "at", "EXISTS": None},
+ ],
+ [
+ "7B00",
+ "220074006500730074002200{0-16}6400610074006100",
+ "2200660069006C0065002200{0-16}3A00{0-16}[30:39]",
+ "220062006F006F006C002200{0-16}3A00{0-16}(7400720075006500|660061006C0073006500)",
+ "220068006500720065002200{0-16}3A00",
+ "2200770069007400680069006E002200{0-16}3A00{0-16}2200*760061006C0075006500*2200",
+ "2200730074006100720074002200{0-16}3A00{0-16}2200760061006C0075006500",
+ "220065006E0064002200{0-16}3A00{0-16}*760061006C00750065002200",
+ "22006B00650079002200{0-16}3A00",
+ "2200610074002200{0-16}3A00",
+ "7D00",
+ ],
+ "utf-16",
+ ),
+ (
+ [
+ {"INDEX": 1, "KEY": "key", "EXISTS": None},
+ ],
+ [
+ "7B00",
+ "{0-16}5B00*7B00*22006B00650079002200{0-16}3A00*7D00*5D00",
+ "7D00",
+ ],
+ "utf-16",
+ ),
+ (
+ [
+ {"KEY": "test", "IS": "data"},
+ {"KEY": "file", "ISTYPE": int},
+ {"KEY": "bool", "ISTYPE": bool},
+ {"KEY": "here", "EXISTS": None},
+ {"KEY": "within", "CONTAINS": "value"},
+ {"KEY": "start", "STARTSWITH": "value"},
+ {"KEY": "end", "ENDSWITH": "value"},
+ {"GOTO": "key", "KEY": "at", "EXISTS": None},
+ ],
+ [
+ "007B",
+ "002200740065007300740022{0-16}0064006100740061",
+ "002200660069006C00650022{0-16}003A{0-16}[30:39]",
+ "00220062006F006F006C0022{0-16}003A{0-16}(0074007200750065|00660061006C00730065)",
+ "002200680065007200650022{0-16}003A",
+ "002200770069007400680069006E0022{0-16}003A{0-16}0022*00760061006C00750065*0022",
+ "0022007300740061007200740022{0-16}003A{0-16}002200760061006C00750065",
+ "00220065006E00640022{0-16}003A{0-16}*00760061006C007500650022",
+ "0022006B006500790022{0-16}003A",
+ "0022006100740022{0-16}003A",
+ "007D",
+ ],
+ "utf-16BE",
+ ),
+ (
+ [
+ {"INDEX": 1, "KEY": "key", "EXISTS": None},
+ ],
+ [
+ "007B",
+ "{0-16}005B*007B*0022006B006500790022{0-16}003A*007D*005D",
+ "007D",
+ ],
+ "utf-16BE",
+ ),
+ (
+ [
+ {"KEY": "test", "IS": "data"},
+ {"KEY": "file", "ISTYPE": int},
+ {"KEY": "bool", "ISTYPE": bool},
+ {"KEY": "here", "EXISTS": None},
+ {"KEY": "within", "CONTAINS": "value"},
+ {"KEY": "start", "STARTSWITH": "value"},
+ {"KEY": "end", "ENDSWITH": "value"},
+ {"GOTO": "key", "KEY": "at", "EXISTS": None},
+ ],
+ [
+ "7B000000",
+ "220000007400000065000000730000007400000022000000{0-16}64000000610000007400000061000000",
+ "2200000066000000690000006C0000006500000022000000{0-16}3A000000{0-16}[30:39]",
+ "22000000620000006F0000006F0000006C00000022000000{0-16}3A000000{0-16}(74000000720000007500000065000000|66000000610000006C0000007300000065000000)",
+ "220000006800000065000000720000006500000022000000{0-16}3A000000",
+ "2200000077000000690000007400000068000000690000006E00000022000000{0-16}3A000000{0-16}22000000*76000000610000006C0000007500000065000000*22000000",
+ "22000000730000007400000061000000720000007400000022000000{0-16}3A000000{0-16}2200000076000000610000006C0000007500000065000000",
+ "22000000650000006E0000006400000022000000{0-16}3A000000{0-16}*76000000610000006C000000750000006500000022000000",
+ "220000006B000000650000007900000022000000{0-16}3A000000",
+ "22000000610000007400000022000000{0-16}3A000000",
+ "7D000000",
+ ],
+ "utf-32le",
+ ),
+ (
+ [
+ {"INDEX": 1, "KEY": "key", "EXISTS": None},
+ ],
+ [
+ "7B000000",
+ "{0-16}5B000000*7B000000*220000006B000000650000007900000022000000{0-16}3A000000*7D000000*5D000000",
+ "7D000000",
+ ],
+ "utf-32le",
+ ),
+]
+
+
+@pytest.mark.parametrize("markers, expected, encoding", encoding_tests)
+def test_unicode_signatures(markers, expected, encoding):
+ """Provide a basic unicode tests.
+
+ These tests are based on the following two sample files:
+
+ ```json
+ {
+ "test": "data",
+ "file": 1,
+ "bool": true,
+ "here": "random...",
+ "within": "_value_",
+ "start": "value_",
+ "end": "_value",
+ "key": {
+ "at": "value"
+ }
+ }
+ ```
+
+ ```json
+ [
+ 0,
+ {
+ "key": "value"
+ }
+ ]
+ ```
+
+ """
+
+ processed = pronom.process_markers(markers, 0, encoding)
+ res = []
+ for sequence in processed[0].byte_sequences:
+ res.append(sequence.value)
+ assert res == expected