From 489da4a2da8afb8ce8a8d4d682390f2ec98937ea Mon Sep 17 00:00:00 2001
From: ross-spencer
Date: Tue, 11 Nov 2025 22:47:21 +0100
Subject: [PATCH 1/7] Add PRONOM export of registry

This feature demonstrates how to use JSONID as a high-level language
for defining PRONOM signatures. JSONID's registry is output as a
PRONOM signature file and should identify JSON objects via DROID.

NB. the format isn't compatible with Siegfried as of yet. It uses
modern DROID syntax.

A small number of other fixes have been made at the same time as I
worked through the export functionality. Changes are only minor, e.g.
making sure json2json continued to work where an earlier commit
introduced an error.
---
 .gitignore                    |   1 +
 justfile                      |  26 ++
 src/jsonid/export.py          | 115 +++++-
 src/jsonid/export_helpers.py  |  30 ++
 src/jsonid/file_processing.py |  21 ++
 src/jsonid/helpers.py         |  25 +-
 src/jsonid/jsonid.py          |   9 +-
 src/jsonid/pronom.py          | 668 ++++++++++++++++++++++++++++++++++
 src/utils/json2json.py        |   4 +-
 tests/test_pronom_export.py   | 420 +++++++++++++++++++++
 10 files changed, 1303 insertions(+), 16 deletions(-)
 create mode 100644 src/jsonid/export_helpers.py
 create mode 100644 src/jsonid/pronom.py
 create mode 100644 tests/test_pronom_export.py

diff --git a/.gitignore b/.gitignore
index 07e77b2..6071354 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,3 +141,4 @@ jsonid-integration-files/
 
 # Secreta
 token.pypi
+jsonid_pronom.xml
diff --git a/justfile b/justfile
index bfb4a32..9be2689 100644
--- a/justfile
+++ b/justfile
@@ -83,6 +83,10 @@ check-debug:
 @hexdump file:
     hexdump -v -e '"\\\x" 1/1 "%02x"' {{file}}
 
+# Hexdump bytes only.
+@hexdump-plain file:
+    hexdump -ve '1/1 "%.2x"' {{file}}
+
 # code coverage
 coverage:
     python -m tox -e coverage
@@ -102,3 +106,25 @@ lookup-ref-ex:
 # lookup ref
 @lookup-ref ref:
     python jsonid.py lookup {{ref}}
+
+# export as PRONOM
+pronom:
+    python jsonid.py --pronom --debug
+
+# move pronom to .droid folder
+pronom-to-droid:
+    ~/.droid6/clean.sh
+    rm -f ~/.droid6/signature_files/jsonid_pronom.xml
+    cp jsonid_pronom.xml ~/.droid6/signature_files/
+
+dir := `pwd`
+
+# load pronom to siegfried via roy
+roy:
+    @echo {{dir}}
+    roy build --noreports -extend {{dir}}/jsonid_pronom.xml
+
+# droid-cli
+droid-cli path: pronom-to-droid
+    java -jar ~/dp/droid/droid-binary-6.8.0-bin/droid-command-line-6.8.0.jar -s 1
+    java -jar ~/dp/droid/droid-binary-6.8.0-bin/droid-command-line-6.8.0.jar -ri {{path}}
diff --git a/src/jsonid/export.py b/src/jsonid/export.py
index c062aa8..b128b5a 100644
--- a/src/jsonid/export.py
+++ b/src/jsonid/export.py
@@ -1,25 +1,35 @@
 """Functions to support export."""
 
+import copy
 import datetime
 import json
 import logging
 from datetime import timezone
+from typing import Final
 
 try:
+    import pronom
     import registry_data
     import version
 except ModuleNotFoundError:
     try:
-        from src.jsonid import registry_data, version
+        from src.jsonid import pronom, registry_data, version
     except ModuleNotFoundError:
-        from jsonid import registry_data, version
+        from jsonid import pronom, registry_data, version
 
 
 logger = logging.getLogger(__name__)
 
+PRONOM_FILENAME: Final[str] = "jsonid_pronom.xml"
+
+
+class PRONOMException(Exception):
+    """Exception class if we can't create a PRONOM signature as expected."""
+
 
 def exportJSON() -> None:  # pylint: disable=C0103
     """Export to JSON."""
-    logger.debug("exporting registry ad JSON")
+    logger.debug("exporting registry as JSON")
     data = registry_data.registry()
     json_obj = []
     id_ = {
@@ -35,3 +45,102 @@ def exportJSON() -> None:  # pylint: disable=C0103
     for datum in data:
        json_obj.append(datum.json())
     print(json.dumps(json_obj, indent=2))
+
+
+def export_pronom() -> None:
+    """Export a PRONOM compatible set of signatures.
+
+    Export is done in two phases. First, a set of proposed "Baseline"
+    JSON signatures is exported to catch many JSON instances.
+
+    Second, the JSONID registry is exported.
+
+    Every export has priority over those before it so that there
+    should be no multiple identification results.
+    """
+
+    # pylint: disable=R0914; too-many local variables.
+
+    logger.debug("exporting registry as PRONOM")
+
+    reg_data = registry_data.registry()
+    formats = []
+
+    encodings = ("UTF-8", "UTF-16", "UTF-16BE", "UTF-32LE")
+    priorities = []
+
+    increment_id = 0
+
+    for encoding in encodings:
+        all_baseline = pronom.create_baseline_json_sequences(encoding)
+        for baseline in all_baseline:
+            increment_id += 1
+            fmt = pronom.Format(
+                id=increment_id,
+                name=f"JSON (Baseline - fmt/817) ({encoding})",
+                version="",
+                puid="jsonid:0000",
+                mime="application/json",
+                classification="structured text",
+                external_signatures=[
+                    pronom.ExternalSignature(
+                        id=increment_id,
+                        signature="json",
+                        type=pronom.EXT,
+                    )
+                ],
+                internal_signatures=[baseline],
+                priorities=priorities,
+            )
+            priorities.append(f"{increment_id}")
+            formats.append(fmt)
+
+    for encoding in encodings:
+        for entry in reg_data:
+            increment_id += 1
+            json_puid = f"{entry.json()['identifier']};{encoding}"
+            name_ = f"{entry.json()['name'][0]['@en']} ({encoding})"
+            markers = entry.json()["markers"]
+            try:
+                mime = entry.json()["mime"][0]
+            except IndexError:
+                mime = ""
+            try:
+                sequences = pronom.process_markers(
+                    markers.copy(),
+                    increment_id,
+                    encoding=encoding,
+                )
+            except pronom.UnprocessableEntity as err:
+                logger.error(
+                    "%s %s: cannot handle: %s",
+                    json_puid,
+                    name_,
+                    err,
+                )
+                for marker in markers:
+                    logger.debug("--- START ---")
+                    logger.debug("marker: %s", marker)
+                    logger.debug("---  END  ---")
+                continue
+            fmt = pronom.Format(
+                id=increment_id,
+                name=name_,
+                version="",
+                puid=json_puid,
+                mime=mime,
+                classification="structured text",
+                external_signatures=[
+                    pronom.ExternalSignature(
+                        id=increment_id,
+                        signature="json",
+                        type=pronom.EXT,
+                    )
+                ],
+                internal_signatures=sequences,
+                priorities=copy.deepcopy(list(set(priorities))),
+            )
+            priorities.append(f"{increment_id}")
+            formats.append(fmt)
+
+    pronom.process_formats_and_save(formats, PRONOM_FILENAME)
diff --git a/src/jsonid/export_helpers.py b/src/jsonid/export_helpers.py
new file mode 100644
index 0000000..0dfec2f
--- /dev/null
+++ b/src/jsonid/export_helpers.py
@@ -0,0 +1,30 @@
+"""Helpers for the export functions."""
+
+import datetime
+from datetime import timezone
+from typing import Final
+from xml.dom.minidom import parseString
+
+UTC_TIME_FORMAT: Final[str] = "%Y-%m-%dT%H:%M:%SZ"
+
+
+def get_utc_timestamp_now():
+    """Get a formatted UTC timestamp for 'now' that can be used when
+    a timestamp is needed.
+    """
+    return datetime.datetime.now(timezone.utc).strftime(UTC_TIME_FORMAT)
+
+
+def new_prettify(c):
+    """Remove excess newlines from DOM output.
+
+    via: https://stackoverflow.com/a/14493981
+    """
+    reparsed = parseString(c)
+    return "\n".join(
+        [
+            line
+            for line in reparsed.toprettyxml(indent=" " * 2).split("\n")
+            if line.strip()
+        ]
+    )
diff --git a/src/jsonid/file_processing.py b/src/jsonid/file_processing.py
index 1d083a4..1da44ee 100644
--- a/src/jsonid/file_processing.py
+++ b/src/jsonid/file_processing.py
@@ -338,6 +338,27 @@ async def identify_plaintext_bytestream(
 
     If analysis is `True` we try to return more low-level file
     information to help folks make appraisal decisions.
+
+    Encodings in Python are split into the following, where UTF-32 on
+    its own is a little confusing. If we are writing the encoding
+    then the byte-order-marker handling is implicit, and so we select
+    UTF-32LE to make sure the byte order is explicit when written.
+
+    If we are decoding, it shouldn't matter: we try to decode, and if
+    it works, it works.
+
+    Encodings:
+
+        "UTF-8",
+        "UTF-16",
+        "UTF-16LE",
+        "UTF-16BE",
+        "UTF-32",
+        "UTF-32LE",
+        "UTF-32BE",
+        "SHIFT-JIS",
+        "BIG5",
+
     """
 
     # pylint: disable=R0911
diff --git a/src/jsonid/helpers.py b/src/jsonid/helpers.py
index ca28bc1..3776fc1 100644
--- a/src/jsonid/helpers.py
+++ b/src/jsonid/helpers.py
@@ -2,7 +2,7 @@
 
 import logging
 import time
-from typing import Union
+from typing import Final, Union
 
 try:
     import htm_template
@@ -75,6 +75,15 @@ def format_marker(marker_text: str, marker: dict) -> str:
     return f"{marker_text}{marker_formatted}\n"
 
 
+TYPE_BOOL: Final[str] = "bool"
+TYPE_FLOAT: Final[str] = "float"
+TYPE_INTEGER: Final[str] = "integer"
+TYPE_LIST: Final[str] = "list"
+TYPE_NONE: Final[str] = "NoneType"
+TYPE_MAP: Final[str] = "map"
+TYPE_STRING: Final[str] = "string"
+
+
 def substitute_type_text(replace_me: Union[str, type]):
     """Output a text substitution for a type that will otherwise not
     pretty-print.
@@ -83,19 +92,19 @@ def substitute_type_text(replace_me: Union[str, type]):
     # pylint: disable=R0911
 
     if replace_me.__name__ == "dict":
-        return "map"
+        return TYPE_MAP
     if replace_me.__name__ == "int":
-        return "integer"
+        return TYPE_INTEGER
     if replace_me.__name__ == "list":
-        return "list"
+        return TYPE_LIST
     if replace_me.__name__ == "str":
-        return "string"
+        return TYPE_STRING
     if replace_me.__name__ == "float":
-        return "float"
+        return TYPE_FLOAT
     if replace_me.__name__ == "bool":
-        return "bool"
+        return TYPE_BOOL
     if replace_me.__name__ == "NoneType":
-        return "NoneType"
+        return TYPE_NONE
     if not isinstance(replace_me, type):
         pass
     return replace_me
diff --git a/src/jsonid/jsonid.py b/src/jsonid/jsonid.py
index 0dfa6b8..7475e0d 100644
--- a/src/jsonid/jsonid.py
+++ b/src/jsonid/jsonid.py
@@ -98,7 +98,7 @@ def main() -> None:
     # pylint: disable=R0912,R0915
     parser = argparse.ArgumentParser(
-        prog="json-id",
+        prog="jsonid",
         description="proof-of-concept identifier for JSON objects on disk based on identifying valid objects and their key-values",
         epilog="for more information visit https://github.com/ffdev-info/json-id",
     )
@@ -156,10 +156,12 @@ def main() -> None:
         help="path to a custom registry to lead into memory replacing the default",
         required=False,
     )
+    # NB. consider output to stdout once the feature is more stable.
    parser.add_argument(
         "--pronom",
-        help="return a PRONOM-centric view of the results",
+        help=f"return a PRONOM-centric view of the results to `{export.PRONOM_FILENAME}` (BETA)",
         required=False,
+        action="store_true",
     )
     parser.add_argument(
         "--export",
@@ -220,7 +222,8 @@ def main() -> None:
     if args.registry:
         raise NotImplementedError("custom registry is not yet available")
     if args.pronom:
-        raise NotImplementedError("pronom view is not yet implemented")
+        export.export_pronom()
+        sys.exit()
     if args.language:
         raise NotImplementedError("multiple languages are not yet implemented")
     if args.export:
diff --git a/src/jsonid/pronom.py b/src/jsonid/pronom.py
new file mode 100644
index 0000000..ddf0d80
--- /dev/null
+++ b/src/jsonid/pronom.py
@@ -0,0 +1,668 @@
+"""PRONOM export routines.
+
+XML tooling: https://xmllint.com/
+"""
+
+import binascii
+import codecs
+import logging
+import xml.dom.minidom
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Any, Final
+
+try:
+    import export_helpers
+    import helpers
+    import registry_matchers
+except ModuleNotFoundError:
+    try:
+        from src.jsonid import export_helpers, helpers, registry_matchers
+    except ModuleNotFoundError:
+        from jsonid import export_helpers, helpers, registry_matchers
+
+
+logger = logging.getLogger(__name__)
+
+
+DISK_SECTOR_SIZE: Final[int] = 4095
+
+# Common PRONOM characters.
+COLON: Final[str] = "3A"
+CURLY_OPEN: Final[str] = "7B"
+CURLY_CLOSE: Final[str] = "7D"
+SQUARE_OPEN: Final[str] = "5B"
+SQUARE_CLOSE: Final[str] = "5D"
+DOUBLE_QUOTE: Final[str] = "22"
+
+# Constant values.
+NUMBER_REGEX: Final[str] = "[30:39]"
+TRUE_VALUE: Final[str] = "74727565"
+FALSE_VALUE: Final[str] = "66616C7365"
+NULL_VALUE: Final[str] = "6E756C6C"
+
+# Our whitespace values could potentially be optimized, e.g. to allow
+# more or fewer bytes per encoding. 16 is a good default to enable
+# some UTF-32 identification.
+WHITESPACE_REGEX: Final[str] = "{0-16}"
+
+# External signature types.
+EXT: Final[str] = "file extension"
+
+# Replacement markers for PRONOM pre-processing.
+MARKER_INDEX_START = "INDEX_START"
+MARKER_INDEX_END = "INDEX END"
+
+
+class UnprocessableEntity(Exception):
+    """Provide a way to give complete feedback to the caller to allow
+    it to exit."""
+
+
+@dataclass
+class ExternalSignature:
+    id: str
+    signature: str
+    type: str
+
+
+@dataclass
+class ByteSequence:
+    id: str
+    pos: str
+    min_off: str
+    max_off: str
+    endian: str
+    value: str
+
+
+@dataclass
+class InternalSignature:
+    id: str
+    name: str
+    byte_sequences: list[ByteSequence]
+
+
+@dataclass
+class Priority:
+    type: str
+    id: str
+
+
+@dataclass
+class Identifier:
+    type: str
+    value: str
+
+
+@dataclass
+class Format:  # pylint: disable=R0902
+    id: str
+    name: str
+    version: str
+    puid: str
+    mime: str
+    classification: str
+    external_signatures: list[ExternalSignature]
+    internal_signatures: list[InternalSignature]
+    priorities: list[str]
+
+
+@lru_cache()
+def _get_bom() -> list:
+    """Generate a list of byte-order markers that allow us to replace
+    markers introduced through various encoding operations.
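+
+    For example, "{" encoded to UTF-16 on a little-endian machine
+    yields FFFE7B00; the leading FFFE byte-order marker must be
+    stripped so that re-encoded sequences can match at arbitrary
+    offsets.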
+    """
+    replaces = [
+        codecs.BOM,
+        codecs.BOM_BE,
+        codecs.BOM_LE,
+        codecs.BOM_UTF8,
+        codecs.BOM_UTF16,
+        codecs.BOM_UTF16_BE,
+        codecs.BOM_UTF16_LE,
+        codecs.BOM_UTF32,
+        codecs.BOM_UTF32_BE,
+        codecs.BOM_UTF32_LE,
+    ]
+    res = []
+    for bom in replaces:
+        hex_bom = ""
+        for marker in bom:
+            # NB. zero-pad so single-digit bytes keep their width.
+            char = f"{marker:02x}"
+            hex_bom = f"{hex_bom}{char}".upper()
+        res.append(hex_bom)
+    return res
+
+
+def create_many_to_one_byte_sequence(internal_signatures: list[InternalSignature]):
+    """Create a many to one byte sequence, i.e. a format with multiple
+    Internal Signatures.
+    """
+    internal_signature = ""
+    for internal in internal_signatures:
+        id_ = internal.id
+        bs = create_one_to_many_byte_sequence(internal.byte_sequences)
+        internal_signature = f"""
+{internal_signature}
+<InternalSignature ID="{id_}">
+    {bs}
+</InternalSignature>
+    """
+    return internal_signature.strip()
+
+
+def calculate_variable_off_bof(item: ByteSequence):
+    """Given variable offsets, calculate the correct syntax."""
+    seq = item.value
+    if (
+        item.min_off != ""
+        and int(item.min_off) > 0
+        and item.max_off != ""
+        and int(item.max_off) > 0
+    ):
+        seq = f"{{{item.min_off}-{int(item.min_off)+int(item.max_off)}}}{seq}"
+    elif item.max_off != "" and int(item.max_off) > 0:
+        seq = f"{{0-{item.max_off}}}{seq}"
+    elif item.min_off != "" and int(item.min_off) > 0:
+        seq = f"{{{item.min_off}}}{seq}"
+    return seq
+
+
+def calculate_variable_off_eof(item: ByteSequence):
+    """Given variable offsets, calculate the correct syntax."""
+    seq = item.value
+    if (
+        item.min_off != ""
+        and int(item.min_off) > 0
+        and item.max_off != ""
+        and int(item.max_off) > 0
+    ):
+        seq = f"{seq}{{{item.min_off}-{int(item.min_off)+int(item.max_off)}}}"
+    elif item.max_off != "" and int(item.max_off) > 0:
+        seq = f"{seq}{{0-{item.max_off}}}"
+    elif item.min_off != "" and int(item.min_off) > 0:
+        seq = f"{seq}{{{item.min_off}}}"
+    return seq
+
+
+def create_one_to_many_byte_sequence(byte_sequences: list[ByteSequence]):
+    """Create a byte sequence object."""
+    byte_sequence = ""
+    for item in byte_sequences:
+        seq = item.value
+        if item.pos.startswith("EOF"):
+            seq = calculate_variable_off_eof(item)
+        elif item.pos.startswith("BOF"):
+            seq = calculate_variable_off_bof(item)
+        byte_sequence = f"""
+{byte_sequence.strip()}
+<ByteSequence Reference="{item.pos}" Sequence="{seq}"/>
+    """
+    return byte_sequence.strip()
+
+
+def create_file_format_collection(fmt: list[Format]):
+    """Create the FileFormatCollection object.
+
+    E.g.
+    ```
+    <FileFormat ID="..." Name="..." PUID="..." Version="" MIMEType="...">
+        <InternalSignatureID>1</InternalSignatureID>
+        <Extension>ext</Extension>
+    </FileFormat>
+    <FileFormat ID="..." Name="..." PUID="..." Version="" MIMEType="...">
+        <InternalSignatureID>880</InternalSignatureID>
+        <InternalSignatureID>881</InternalSignatureID>
+        <Extension>ai</Extension>
+        <HasPriorityOverFileFormatID>86</HasPriorityOverFileFormatID>
+        <HasPriorityOverFileFormatID>331</HasPriorityOverFileFormatID>
+        <HasPriorityOverFileFormatID>332</HasPriorityOverFileFormatID>
+        <HasPriorityOverFileFormatID>771</HasPriorityOverFileFormatID>
+        <HasPriorityOverFileFormatID>773</HasPriorityOverFileFormatID>
+    </FileFormat>
+    ```
+
+    """
+    internal_sigs = [
+        f"<InternalSignatureID>{sig.id}</InternalSignatureID>"
+        for sig in fmt.internal_signatures
+    ]
+    external_sigs = [
+        f"<Extension>{sig.signature}</Extension>"
+        for sig in fmt.external_signatures
+        if sig.type.lower() == EXT
+    ]
+    priorities = [
+        f"<HasPriorityOverFileFormatID>{priority}</HasPriorityOverFileFormatID>"
+        for priority in fmt.priorities
+    ]
+    ff = f"""
+    <FileFormat ID="{fmt.id}" Name="{fmt.name}" PUID="{fmt.puid}" Version="{fmt.version}" MIMEType="{fmt.mime}">
+        {"".join(internal_sigs).strip()}
+        {"".join(external_sigs).strip()}
+        {"".join(priorities).strip()}
+    </FileFormat>
+    """
+    return ff.strip()
+
+
+def process_formats_and_save(formats: list[Format], filename: str):
+    """Process the collected formats and output a signature file.
+
+    NB. Given our dataclasses here, we have the opportunity to rework
+    this data into many new structures. We output XML because DROID
+    expects XML.
+    """
+    isc = []
+    ffc = []
+    for fmt in formats:
+        ffc.append(create_file_format_collection(fmt))
+        if fmt.internal_signatures:
+            isc.append(create_many_to_one_byte_sequence(fmt.internal_signatures))
+    droid_template = f"""
+<?xml version="1.0" encoding="UTF-8"?>
+<FFSignatureFile DateCreated="{export_helpers.get_utc_timestamp_now()}" Version="1">
+<InternalSignatureCollection>
+    {"".join(isc).strip()}
+</InternalSignatureCollection>
+<FileFormatCollection>
+    {"".join(ffc).strip()}
+</FileFormatCollection>
+</FFSignatureFile>
+    """
+    dom = None
+    signature_file = droid_template.strip().replace("\n", "")
+    try:
+        dom = xml.dom.minidom.parseString(signature_file)
+    except xml.parsers.expat.ExpatError as err:
+        logger.error("cannot process xml: %s", err)
+        return
+    pretty_xml = dom.toprettyxml(indent=" ", encoding="utf-8")
+    prettier_xml = export_helpers.new_prettify(pretty_xml)
+    logger.info("outputting to: %s", filename)
+    with open(filename, "w", encoding="utf-8") as output_file:
+        output_file.write(prettier_xml)
+
+
+def encode_roundtrip(hexed_val: str, encoding: str) -> str:
+    """We want to get a plain-text byte-sequence into a new
+    encoding. It takes a few hops and skips.
+    """
+    val = hexed_val.strip()
+    try:
+        re_encoded = binascii.unhexlify(hexed_val).decode("utf-8").encode(encoding)
+    except (binascii.Error, UnicodeDecodeError) as err:
+        logger.error("cannot convert: %s len: %s ('%s')", hexed_val, len(val), err)
+        return val
+    hex_val = binascii.hexlify(re_encoded).decode().upper()
+    for bom in _get_bom():
+        if not hex_val.startswith(bom):
+            continue
+        return hex_val.replace(bom, "")
+    return hex_val
+
+
+def _type_to_str(type_: type, encoding: str) -> str:
+    """Given a data type marker we need to convert the type into a
+    byte sequence that will match the type.
+
+    E.g. BOOLEAN types evaluate to true or false encoded in ASCII.
+    E.g. STRING types need to begin and end with double-quotes but the
+    string itself is just a wildcard. The wildcard will match any
+    value between the double quotes.
+    """
+
+    curly_open_encoded = encode_roundtrip(CURLY_OPEN, encoding)
+    curly_close_encoded: Final[str] = encode_roundtrip(CURLY_CLOSE, encoding)
+    square_open_encoded: Final[str] = encode_roundtrip(SQUARE_OPEN, encoding)
+    square_close_encoded: Final[str] = encode_roundtrip(SQUARE_CLOSE, encoding)
+    double_quote_encoded: Final[str] = encode_roundtrip(DOUBLE_QUOTE, encoding)
+
+    try:
+        type_ = helpers.substitute_type_text(type_)
+    except AttributeError:
+        logger.debug("type_ already converted: %s", type_)
+
+    if type_ in (helpers.TYPE_INTEGER, helpers.TYPE_FLOAT):
+        # an integer field will begin 0-9 but it is unclear how to
+        # represent larger numbers, or whether we need to.
+        return NUMBER_REGEX
+    if type_ == helpers.TYPE_BOOL:
+        # true | false
+        return f"({encode_roundtrip(TRUE_VALUE, encoding)}|{encode_roundtrip(FALSE_VALUE, encoding)})"
+    if type_ == helpers.TYPE_STRING:
+        # string begins with a double quote and ends in a double quote.
+        return f"{double_quote_encoded}*{double_quote_encoded}"
+    if type_ == helpers.TYPE_MAP:
+        # { == 7B; } == 7D
+        return f"{curly_open_encoded}*{curly_close_encoded}"
+    if type_ == helpers.TYPE_LIST:
+        # [ == 5B; ] == 5D
+        return f"{square_open_encoded}*{square_close_encoded}"
+    if type_ == helpers.TYPE_NONE:
+        # null
+        return f"{encode_roundtrip(NULL_VALUE, encoding)}"
+    # This should only trigger for incorrect values at this point.
+    raise UnprocessableEntity(f"type_to_str: {type_}")
+
+
+def _complex_is_type(marker: Any) -> str:
+    """Complex IS might be another data structure, e.g. a dict, or
+    something else that we can't convert easily. It is simply a WIP
+    for now.
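+
+    A hypothetical example: {"KEY": "versions", "IS": {"v1": {}}}
+    carries a dict value and cannot yet be flattened into a single
+    byte sequence.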
+    """
+    raise UnprocessableEntity(f"complex IS type: '{marker}' (WIP)")
+
+
+def _str_to_hex_str(string: str) -> str:
+    """Convert string to hexadecimal bytes.
+
+    We convert to bytes here first without encoding and then convert
+    the bytes to an encoding second. It should be possible to combine
+    those two procedures, but this has worked well during the
+    prototyping phase.
+    """
+    hex_bytes = []
+    for byte_ in string.encode():
+        hex_bytes.append(hex(byte_).replace("0x", ""))
+    hex_str = "".join(hex_bytes).upper()
+    return hex_str
+
+
+def quote_and_encode(value, encoding) -> str:
+    """Quote and encode a given value."""
+
+    double_quote_encoded: Final[str] = encode_roundtrip(DOUBLE_QUOTE, encoding)
+    return f"{double_quote_encoded}{encode_roundtrip(value, encoding)}{double_quote_encoded}"
+
+
+def convert_marker_to_signature_sequence(marker: dict, encoding: str) -> str:
+    """Convert a JSONID marker into a signature sequence."""
+
+    # pylint: disable=R0914; too-many local variables.
+    # pylint: disable=R0911; too-many return statements.
+    # pylint: disable=R0915; too-many statements.
+
+    logger.debug("marker: %s", marker)
+
+    colon_encoded: Final[str] = encode_roundtrip(COLON, encoding)
+    double_quote_encoded: Final[str] = encode_roundtrip(DOUBLE_QUOTE, encoding)
+    curly_open_encoded: Final[str] = encode_roundtrip(CURLY_OPEN, encoding)
+    curly_close_encoded: Final[str] = encode_roundtrip(CURLY_CLOSE, encoding)
+    square_open_encoded: Final[str] = encode_roundtrip(SQUARE_OPEN, encoding)
+    square_close_encoded: Final[str] = encode_roundtrip(SQUARE_CLOSE, encoding)
+
+    instruction = ""
+    if registry_matchers.MARKER_GOTO in marker.keys():
+        # GOTO KEY and match KEY.
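+        # E.g. the marker {"GOTO": "sops", "KEY": "kms"} becomes, in
+        # UTF-8: 22736F707322{0-16}3A*{0-16}226B6D7322{0-16}3A, i.e.
+        # quoted "sops", a colon, a wildcard, then quoted "kms" and
+        # its colon.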
goto_key = _str_to_hex_str(marker["GOTO"])
+        key_at_goto = _str_to_hex_str(marker["KEY"])
+        goto_encoded = quote_and_encode(goto_key, encoding)
+        key_encoded = quote_and_encode(key_at_goto, encoding)
+        instruction = f"{goto_encoded}{WHITESPACE_REGEX}{colon_encoded}*{WHITESPACE_REGEX}{key_encoded}{WHITESPACE_REGEX}{colon_encoded}"
+        marker.pop("GOTO")
+        marker.pop("KEY")
+        return instruction.upper()
+    if registry_matchers.MARKER_INDEX in marker.keys():
+        key = _str_to_hex_str(marker["KEY"])
+        instruction = f"{WHITESPACE_REGEX}{square_open_encoded}*{curly_open_encoded}*{double_quote_encoded}{encode_roundtrip(key, encoding)}{double_quote_encoded}{WHITESPACE_REGEX}{colon_encoded}*{curly_close_encoded}*{square_close_encoded}"
+        marker.pop("INDEX")
+        marker.pop("KEY")
+        return instruction.upper()
+    if "KEY" in marker.keys():
+        key = _str_to_hex_str(marker["KEY"])
+        instruction = quote_and_encode(key, encoding)
+        marker.pop("KEY")
+    if registry_matchers.MARKER_KEY_EXISTS in marker.keys():
+        instruction = f"{instruction}{WHITESPACE_REGEX}{colon_encoded}".upper()
+        return instruction
+    if registry_matchers.MARKER_IS_TYPE in marker.keys():
+        is_type = _type_to_str(marker["ISTYPE"], encoding=encoding)
+        type_val = (
+            f"{instruction}{WHITESPACE_REGEX}{colon_encoded}{WHITESPACE_REGEX}{is_type}"
+        )
+        return type_val.upper()
+    if registry_matchers.MARKER_IS in marker.keys():
+        marker_is = marker["IS"]
+        if not isinstance(marker_is, str):
+            _complex_is_type(marker_is)
+        equals = _str_to_hex_str(marker_is)
+        is_val = f"{instruction}{WHITESPACE_REGEX}{encode_roundtrip(equals, encoding)}"
+        return is_val.upper()
+    if registry_matchers.MARKER_STARTSWITH in marker.keys():
+        starts_with = _str_to_hex_str(marker["STARTSWITH"])
+        starts_with_val = f"{instruction}{WHITESPACE_REGEX}{colon_encoded}{WHITESPACE_REGEX}{double_quote_encoded}{encode_roundtrip(starts_with, encoding)}"
+        return starts_with_val.upper()
+    if registry_matchers.MARKER_ENDSWITH in marker.keys():
+        ends_with = _str_to_hex_str(marker["ENDSWITH"])
+        ends_with_val = f"{instruction}{WHITESPACE_REGEX}{colon_encoded}{WHITESPACE_REGEX}*{encode_roundtrip(ends_with, encoding)}{double_quote_encoded}"
+        return ends_with_val.upper()
+    if registry_matchers.MARKER_CONTAINS in marker.keys():
+        contains = _str_to_hex_str(marker["CONTAINS"])
+        contains_val = f"{instruction}{WHITESPACE_REGEX}{colon_encoded}{WHITESPACE_REGEX}{double_quote_encoded}*{encode_roundtrip(contains, encoding)}*{double_quote_encoded}"
+        return contains_val.upper()
+    if registry_matchers.MARKER_REGEX in marker.keys():
+        raise UnprocessableEntity("REGEX not yet implemented")
+    if registry_matchers.MARKER_KEY_NO_EXIST in marker.keys():
+        raise UnprocessableEntity("KEY NO EXIST not yet implemented")
+    # We should never arrive here. In the future clean this up so we
+    # only return when we have information.
+    return ""
+
+
+def preprocess_goto_markers(markers: list) -> list:
+    """Preprocess markers to remove data that is otherwise duplicated
+    when converted to a PRONOM signature, e.g.
GOTO."""
+
+    out = []
+    for marker in markers:
+        if registry_matchers.MARKER_GOTO not in marker:
+            out.append(marker)
+            continue
+        key = marker.pop("GOTO")
+        new_marker = {"KEY": key, "EXISTS": None}
+        if new_marker not in out:
+            out.append(new_marker)
+        out.append(marker)
+    return out
+
+
+def process_markers(
+    markers: list, sig_id: int, encoding: str = ""
+) -> list[InternalSignature]:
+    """Given a set of markers for a document type, process them into
+    a set of byte sequences and finally an internal signature sequence
+    that can be output as a PRONOM signature.
+
+    Returns a list containing the processed internal signature.
+    NB. conversion is potentially lossy, e.g. when matching types
+    such as strings via wildcards.
+
+    dict_keys(['CONTAINS'])
+    dict_keys(['ENDSWITH'])
+    dict_keys(['IS'])
+    dict_keys(['ISTYPE'])
+    dict_keys(['STARTSWITH'])
+
+    key(0-n):(0-n)value
+
+    Need to return something like:
+
+        <ByteSequence Reference="..." Sequence="..."/>
+
+    Different encodings need to be accounted for, e.g. (with added
+    whitespace below)
+
+    UTF-32-LE:
+
+    00000000: 2000 0000 2000 0000 2000 0000 2000 0000   ... ... ... ...
+    00000010: 2000 0000 2000 0000 0a00 0000 0a00 0000   ... ...........
+    00000020: 0a00 0000 0a00 0000 7b00 0000 2200 0000  ........{..."...
+    00000030: 6100 0000 2200 0000 3a00 0000 2000 0000  a..."...:... ...
+    00000040: 2200 0000 6200 0000 2200 0000 7d00 0000  "...b..."...}...
+    00000050: 0a00 0000                                ....
+
+    UTF-32-BE:
+
+    00000000: 0000 0020 0000 0020 0000 0020 0000 0020   ... ... ... ...
+    00000010: 0000 0020 0000 0020 0000 000a 0000 000a   ... ... ........
+    00000020: 0000 000a 0000 000a 0000 007b 0000 0022  ...........{..."
+    00000030: 0000 0061 0000 0022 0000 003a 0000 0020  ...a..."...:...
+    00000040: 0000 0022 0000 0062 0000 0022 0000 007d  ..."...b..."...}
+    00000050: 0000 000a                                ....
+
+    UTF-16-LE:
+
+    00000000: 2000 2000 2000 2000 2000 2000 0a00 0a00   . . . . . .....
+    00000010: 0a00 0a00 7b00 2200 6100 2200 3a00 2000  ....{.".a.".:. .
+    00000020: 2200 6200 2200 7d00 0a00                 ".b.".}...
+
+    UTF-16-BE:
+
+    00000000: 0020 0020 0020 0020 0020 0020 000a 000a   . . . . . . ....
+    00000010: 000a 000a 007b 0022 0061 0022 003a 0020  .....{.".a.".:.
+    00000020: 0022 0062 0022 007d 000a                 .".b.".}..
+
+    """
+
+    curly_open_encoded: Final[str] = encode_roundtrip(CURLY_OPEN, encoding)
+    curly_close_encoded: Final[str] = encode_roundtrip(CURLY_CLOSE, encoding)
+
+    sequences = []
+
+    markers = preprocess_goto_markers(markers)
+
+    for marker in markers:
+        sig_sequence = convert_marker_to_signature_sequence(marker, encoding)
+        sequences.append(sig_sequence)
+
+    byte_sequences = []
+
+    byte_sequences.append(
+        ByteSequence(
+            id=0,
+            pos="BOF",
+            min_off=0,
+            max_off=f"{DISK_SECTOR_SIZE}",
+            endian="",
+            value=curly_open_encoded,
+        )
+    )
+
+    for idx, item in enumerate(sequences, 0):
+        logger.debug("%s. %s", idx, item)
+        byte_sequence = ByteSequence(
+            id=idx,
+            pos="VAR",
+            min_off="",
+            max_off="",
+            endian="",
+            value=item,
+        )
+        byte_sequences.append(byte_sequence)
+
+    byte_sequences.append(
+        ByteSequence(
+            id=0,
+            pos="EOF",
+            min_off="0",
+            max_off=f"{DISK_SECTOR_SIZE}",
+            endian="",
+            value=curly_close_encoded,
+        )
+    )
+
+    internal_signature = InternalSignature(
+        id=sig_id,
+        name="",
+        byte_sequences=byte_sequences,
+    )
+
+    return [internal_signature]
+
+
+def create_baseline_json_sequences(encoding: str):
+    """Create baseline JSON sequences that match map and list types
+    with various different encodings.
+    """
+
+    # pylint: disable=R0914; too-many local variables.
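+    #
+    # A baseline signature is deliberately generic: BOF matches "{"
+    # or "[", EOF matches "}" or "]", and a single VAR fragment
+    # anchors one JSON value shape (string key, number, boolean, or
+    # null).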
+
+    curly_open_encoded: Final[str] = encode_roundtrip(CURLY_OPEN, encoding)
+    curly_close_encoded: Final[str] = encode_roundtrip(CURLY_CLOSE, encoding)
+    square_open_encoded: Final[str] = encode_roundtrip(SQUARE_OPEN, encoding)
+    square_close_encoded: Final[str] = encode_roundtrip(SQUARE_CLOSE, encoding)
+
+    colon_encoded: Final[str] = encode_roundtrip(COLON, encoding)
+    double_quote_encoded: Final[str] = encode_roundtrip(DOUBLE_QUOTE, encoding)
+
+    bof = f"({curly_open_encoded}|{square_open_encoded})"
+    eof = f"({curly_close_encoded}|{square_close_encoded})"
+
+    no_encoded: Final[str] = NUMBER_REGEX
+    true_encoded: Final[str] = encode_roundtrip(TRUE_VALUE, encoding)
+    false_encoded: Final[str] = encode_roundtrip(FALSE_VALUE, encoding)
+    null_encoded: Final[str] = encode_roundtrip(NULL_VALUE, encoding)
+
+    options = (
+        f"{double_quote_encoded}{WHITESPACE_REGEX}{colon_encoded}",
+        no_encoded,
+        f"({true_encoded}|{false_encoded})",
+        null_encoded,
+    )
+
+    sigs = []
+
+    for opt in options:
+
+        bs = []
+
+        bs.append(
+            ByteSequence(
+                id=1,
+                pos="BOF",
+                min_off=0,
+                max_off=f"{DISK_SECTOR_SIZE}",
+                endian="",
+                value=bof,
+            )
+        )
+
+        bs.append(
+            ByteSequence(
+                id=1,
+                pos="VAR",
+                min_off=0,
+                max_off=0,
+                endian="",
+                value=opt,
+            )
+        )
+
+        bs.append(
+            ByteSequence(
+                id=1,
+                pos="EOF",
+                min_off="0",
+                max_off=f"{DISK_SECTOR_SIZE}",
+                endian="",
+                value=eof,
+            )
+        )
+
+        iss = InternalSignature(
+            id=0,
+            name="",
+            byte_sequences=bs,
+        )
+
+        sigs.append(iss)
+
+    return sigs
diff --git a/src/utils/json2json.py b/src/utils/json2json.py
index bdbbf21..c54b022 100644
--- a/src/utils/json2json.py
+++ b/src/utils/json2json.py
@@ -62,7 +62,7 @@ async def identify_plaintext_bytestream(path: str) -> Tuple[bool, str]:
 async def identify_json(paths: list[str]):
     """Identify objects."""
     for idx, path in enumerate(paths):
-        valid, data, _, _ = await identify_plaintext_bytestream(path)
+        valid, data = await identify_plaintext_bytestream(path)
         if not valid:
             continue
         print(json.dumps(data, indent=2))
@@ -114,7 +114,7 @@ def main() -> None:
     parser = argparse.ArgumentParser(
         prog="json2json",
         description="parse JSON UTF-16 (BE-LE) objects and output them as UTF-8 for the sake of developer ergonomics",
-        epilog="for more information visit https://github.com/ffdev-info/json-id",
+        epilog="for more information visit https://github.com/ffdev-info/jsonid",
     )
     parser.add_argument(
         "--debug",
diff --git a/tests/test_pronom_export.py b/tests/test_pronom_export.py
new file mode 100644
index 0000000..7b6e9d5
--- /dev/null
+++ b/tests/test_pronom_export.py
@@ -0,0 +1,420 @@
+"""Test PRONOM export functions.
+
+NB. many of the tests here can be parametrized once we have good
+coverage. They've all been tested individually as the conversion
+scripts are ironed out. This will take on more relevance if the
+tool is used more for JSON signature creation in general.
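+
+To regenerate the registry-wide signature file, the justfile wraps the
+(BETA) entry point:
+
+```sh
+python jsonid.py --pronom --debug
+```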
+"""
+
+import pytest
+
+from src.jsonid import pronom
+
+encode_roundtrip_tests = [
+    (
+        "74657374",
+        "7400650073007400",
+        "utf-16",
+    )
+]
+
+
+@pytest.mark.parametrize("hex_sequences, expected, encoding", encode_roundtrip_tests)
+def test_encode_roundtrip(hex_sequences, expected, encoding):
+    """Re-encode a set of hexadecimal values to a new encoding."""
+
+    res = pronom.encode_roundtrip(hex_sequences, encoding)
+    assert res == expected
+
+
+preprocess_goto_tests = [
+    (
+        [
+            {"KEY": "sops", "EXISTS": None},
+            {"GOTO": "sops", "KEY": "kms", "EXISTS": None},
+            {"GOTO": "sops", "KEY": "pgp", "EXISTS": None},
+        ],
+        [
+            {"KEY": "sops", "EXISTS": None},
+            {"KEY": "kms", "EXISTS": None},
+            {"KEY": "pgp", "EXISTS": None},
+        ],
+    )
+]
+
+
+@pytest.mark.parametrize("markers_in, markers_out", preprocess_goto_tests)
+def test_preprocess_goto_markers(markers_in: list, markers_out: list):
+    """Make sure preprocess markers works as anticipated.
+
+    GOTO is also largely synonymous with "KEY" exists and so we can
+    remove duplicate examples of GOTO and ensure just one "EXISTS" for
+    that GOTO exists.
+    """
+
+    res = pronom.preprocess_goto_markers(markers_in)
+    assert res == markers_out
+
+
+preprocess_index_tests = [
+    (
+        [
+            {"INDEX": 0, "KEY": "Content-Length", "EXISTS": None},
+            {"INDEX": 0, "KEY": "Content-Type", "EXISTS": None},
+            {"INDEX": 0, "KEY": "X-TIKA:Parsed-By", "EXISTS": None},
+            {"INDEX": 0, "KEY": "X-TIKA:parse_time_millis", "EXISTS": None},
+        ],
+        [
+            {"INDEX_START": None},
+            {"KEY": "Content-Length", "EXISTS": None},
+            {"INDEX END": None},
+            {"INDEX_START": None},
+            {"KEY": "Content-Type", "EXISTS": None},
+            {"INDEX END": None},
+            {"INDEX_START": None},
+            {"KEY": "X-TIKA:Parsed-By", "EXISTS": None},
+            {"INDEX END": None},
+            {"INDEX_START": None},
+            {"KEY": "X-TIKA:parse_time_millis", "EXISTS": None},
+            {"INDEX END": None},
+        ],
+    )
+]
+
+
+def test_export_sops():
+    """SOPS is a good example of a more complex signature. Ensure
+    it works here.
+    """
+
+    markers = [
+        {"KEY": "sops", "EXISTS": None},
+        {"GOTO": "sops", "KEY": "kms", "EXISTS": None},
+        {"GOTO": "sops", "KEY": "pgp", "EXISTS": None},
+    ]
+
+    processed = pronom.process_markers(markers, 0, "utf-8")
+
+    res = []
+    for sequence in processed[0].byte_sequences:
+        res.append(sequence.value)
+
+    expected = [
+        "7B",
+        "22736F707322{0-16}3A",
+        "226B6D7322{0-16}3A",
+        "2270677022{0-16}3A",
+        "7D",
+    ]
+
+    assert res == expected
+
+
+def test_ocfl_inventory():
+    """OCFL inventory is one of the first examples that worked out of
+    the box and provides good control for errors."""
+
+    markers = [
+        {"KEY": "type", "STARTSWITH": "https://ocfl.io/"},
+        {"KEY": "type", "CONTAINS": "spec/#inventory"},
+        {"KEY": "head", "EXISTS": None},
+        {"KEY": "manifest", "EXISTS": None},
+    ]
+
+    processed = pronom.process_markers(markers, 0, "utf-8")
+
+    res = []
+    for sequence in processed[0].byte_sequences:
+        res.append(sequence.value)
+
+    expected = [
+        "7B",
+        "227479706522{0-16}3A{0-16}2268747470733A2F2F6F63666C2E696F2F",
+        "227479706522{0-16}3A{0-16}22*737065632F23696E76656E746F7279*22",
+        "226865616422{0-16}3A",
+        "226D616E696665737422{0-16}3A",
+        "7D",
+    ]
+    assert res == expected
+
+
+def test_json_patch():
+    """Ensure that JSON patch style markers are converted
+    correctly.
+ """ + + markers = [ + {"INDEX": 0, "KEY": "op", "EXISTS": None}, + {"INDEX": 0, "KEY": "path", "EXISTS": None}, + ] + + processed = pronom.process_markers(markers, 0, "utf-8") + + res = [] + for sequence in processed[0].byte_sequences: + res.append(sequence.value) + + expected = [ + "7B", + "{0-16}5B*7B*226F7022{0-16}3A*7D*5D", + "{0-16}5B*7B*227061746822{0-16}3A*7D*5D", + "7D", + ] + + assert res == expected + + +def test_gltf_schema(): + """Ensure that GLTF style patterns are converted correctly. + + NB. the pattern here is potentially different from that in the + registry and the registry may need updating to be more + permissive, or simply corrected. + """ + + markers = [ + {"KEY": "$schema", "STARTSWITH": "https://json-schema.org/"}, + {"KEY": "$schema", "ENDSWITH": "/schema"}, + {"KEY": "title", "EXISTS": None}, + {"KEY": "type", "IS": "object"}, + {"KEY": "description", "IS": "The root object for a glTF asset."}, + ] + + processed = pronom.process_markers(markers, 0, "utf-8") + + res = [] + for sequence in processed[0].byte_sequences: + res.append(sequence.value) + + expected = [ + "7B", + "2224736368656D6122{0-16}3A{0-16}2268747470733A2F2F6A736F6E2D736368656D612E6F72672F", + "2224736368656D6122{0-16}3A{0-16}*2F736368656D6122", + "227469746C6522{0-16}3A", + "227479706522{0-16}6F626A656374", + "226465736372697074696F6E22{0-16}54686520726F6F74206F626A65637420666F72206120676C54462061737365742E", + "7D", + ] + + assert res == expected + + +def test_tika_recursive(): + """Ensure that TIKA style signatures (relying largely on INDEX) are + converted correctly. + """ + + markers = [ + {"INDEX": 0, "KEY": "Content-Length", "EXISTS": None}, + {"INDEX": 0, "KEY": "Content-Type", "EXISTS": None}, + {"INDEX": 0, "KEY": "X-TIKA:Parsed-By", "EXISTS": None}, + {"INDEX": 0, "KEY": "X-TIKA:parse_time_millis", "EXISTS": None}, + ] + + processed = pronom.process_markers(markers, 0, "utf-8") + + res = [] + for sequence in processed[0].byte_sequences: + res.append(sequence.value) + + expected = [ + "7B", + "{0-16}5B*7B*22436F6E74656E742D4C656E67746822{0-16}3A*7D*5D", + "{0-16}5B*7B*22436F6E74656E742D5479706522{0-16}3A*7D*5D", + "{0-16}5B*7B*22582D54494B413A5061727365642D427922{0-16}3A*7D*5D", + "{0-16}5B*7B*22582D54494B413A70617273655F74696D655F6D696C6C697322{0-16}3A*7D*5D", + "7D", + ] + + assert res == expected + + +encoding_tests = [ + ( + [ + {"KEY": "test", "IS": "data"}, + {"KEY": "file", "ISTYPE": int}, + {"KEY": "bool", "ISTYPE": bool}, + {"KEY": "here", "EXISTS": None}, + {"KEY": "within", "CONTAINS": "value"}, + {"KEY": "start", "STARTSWITH": "value"}, + {"KEY": "end", "ENDSWITH": "value"}, + {"GOTO": "key", "KEY": "at", "EXISTS": None}, + ], + [ + "7B", + "227465737422{0-16}64617461", + "2266696C6522{0-16}3A{0-16}[30:39]", + "22626F6F6C22{0-16}3A{0-16}(74727565|66616C7365)", + "226865726522{0-16}3A", + "2277697468696E22{0-16}3A{0-16}22*76616C7565*22", + "22737461727422{0-16}3A{0-16}2276616C7565", + "22656E6422{0-16}3A{0-16}*76616C756522", + "226B657922{0-16}3A", + "22617422{0-16}3A", + "7D", + ], + "utf-8", + ), + ( + [ + {"INDEX": 1, "KEY": "key", "EXISTS": None}, + ], + [ + "7B", + "{0-16}5B*7B*226B657922{0-16}3A*7D*5D", + "7D", + ], + "utf-8", + ), + ( + [ + {"KEY": "test", "IS": "data"}, + {"KEY": "file", "ISTYPE": int}, + {"KEY": "bool", "ISTYPE": bool}, + {"KEY": "here", "EXISTS": None}, + {"KEY": "within", "CONTAINS": "value"}, + {"KEY": "start", "STARTSWITH": "value"}, + {"KEY": "end", "ENDSWITH": "value"}, + {"GOTO": "key", "KEY": "at", "EXISTS": None}, + ], + [ + "7B00", + 
"220074006500730074002200{0-16}6400610074006100", + "2200660069006C0065002200{0-16}3A00{0-16}[30:39]", + "220062006F006F006C002200{0-16}3A00{0-16}(7400720075006500|660061006C0073006500)", + "220068006500720065002200{0-16}3A00", + "2200770069007400680069006E002200{0-16}3A00{0-16}2200*760061006C0075006500*2200", + "2200730074006100720074002200{0-16}3A00{0-16}2200760061006C0075006500", + "220065006E0064002200{0-16}3A00{0-16}*760061006C00750065002200", + "22006B00650079002200{0-16}3A00", + "2200610074002200{0-16}3A00", + "7D00", + ], + "utf-16", + ), + ( + [ + {"INDEX": 1, "KEY": "key", "EXISTS": None}, + ], + [ + "7B00", + "{0-16}5B00*7B00*22006B00650079002200{0-16}3A00*7D00*5D00", + "7D00", + ], + "utf-16", + ), + ( + [ + {"KEY": "test", "IS": "data"}, + {"KEY": "file", "ISTYPE": int}, + {"KEY": "bool", "ISTYPE": bool}, + {"KEY": "here", "EXISTS": None}, + {"KEY": "within", "CONTAINS": "value"}, + {"KEY": "start", "STARTSWITH": "value"}, + {"KEY": "end", "ENDSWITH": "value"}, + {"GOTO": "key", "KEY": "at", "EXISTS": None}, + ], + [ + "007B", + "002200740065007300740022{0-16}0064006100740061", + "002200660069006C00650022{0-16}003A{0-16}[30:39]", + "00220062006F006F006C0022{0-16}003A{0-16}(0074007200750065|00660061006C00730065)", + "002200680065007200650022{0-16}003A", + "002200770069007400680069006E0022{0-16}003A{0-16}0022*00760061006C00750065*0022", + "0022007300740061007200740022{0-16}003A{0-16}002200760061006C00750065", + "00220065006E00640022{0-16}003A{0-16}*00760061006C007500650022", + "0022006B006500790022{0-16}003A", + "0022006100740022{0-16}003A", + "007D", + ], + "utf-16BE", + ), + ( + [ + {"INDEX": 1, "KEY": "key", "EXISTS": None}, + ], + [ + "007B", + "{0-16}005B*007B*0022006B006500790022{0-16}003A*007D*005D", + "007D", + ], + "utf-16BE", + ), + ( + [ + {"KEY": "test", "IS": "data"}, + {"KEY": "file", "ISTYPE": int}, + {"KEY": "bool", "ISTYPE": bool}, + {"KEY": "here", "EXISTS": None}, + {"KEY": "within", "CONTAINS": "value"}, + {"KEY": "start", "STARTSWITH": "value"}, + {"KEY": "end", "ENDSWITH": "value"}, + {"GOTO": "key", "KEY": "at", "EXISTS": None}, + ], + [ + "7B000000", + "220000007400000065000000730000007400000022000000{0-16}64000000610000007400000061000000", + "2200000066000000690000006C0000006500000022000000{0-16}3A000000{0-16}[30:39]", + "22000000620000006F0000006F0000006C00000022000000{0-16}3A000000{0-16}(74000000720000007500000065000000|66000000610000006C0000007300000065000000)", + "220000006800000065000000720000006500000022000000{0-16}3A000000", + "2200000077000000690000007400000068000000690000006E00000022000000{0-16}3A000000{0-16}22000000*76000000610000006C0000007500000065000000*22000000", + "22000000730000007400000061000000720000007400000022000000{0-16}3A000000{0-16}2200000076000000610000006C0000007500000065000000", + "22000000650000006E0000006400000022000000{0-16}3A000000{0-16}*76000000610000006C000000750000006500000022000000", + "220000006B000000650000007900000022000000{0-16}3A000000", + "22000000610000007400000022000000{0-16}3A000000", + "7D000000", + ], + "utf-32le", + ), + ( + [ + {"INDEX": 1, "KEY": "key", "EXISTS": None}, + ], + [ + "7B000000", + "{0-16}5B000000*7B000000*220000006B000000650000007900000022000000{0-16}3A000000*7D000000*5D000000", + "7D000000", + ], + "utf-32le", + ), +] + + +@pytest.mark.parametrize("markers, expected, encoding", encoding_tests) +def test_unicode_signatures(markers, expected, encoding): + """Provide a basic unicode tests. 
+ + These tests are based on the following two sample files: + + ```json + { + "test": "data", + "file": 1, + "bool": true, + "here": "random...", + "within": "_value_", + "start": "value_", + "end": "_value", + "key": { + "at": "value" + } + } + ``` + + ```json + [ + 0, + { + "key": "value" + } + ] + ``` + + """ + + processed = pronom.process_markers(markers, 0, encoding) + res = [] + for sequence in processed[0].byte_sequences: + res.append(sequence.value) + assert res == expected From 5c02a8d36a4530fe456083e804e1ae8cef82ad4a Mon Sep 17 00:00:00 2001 From: ross-spencer Date: Sun, 4 Jan 2026 17:15:22 +0100 Subject: [PATCH 2/7] Add json2json entry point --- json2json.py | 12 ++++++++++++ jsonid.py | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 json2json.py diff --git a/json2json.py b/json2json.py new file mode 100644 index 0000000..1bcbe2b --- /dev/null +++ b/json2json.py @@ -0,0 +1,12 @@ +"""JSONID entry-points.""" + +from src.utils import json2json + + +def main(): + """Primary entry point for this script.""" + json2json.main() + + +if __name__ == "__main__": + main() diff --git a/jsonid.py b/jsonid.py index 5c087ae..597256b 100644 --- a/jsonid.py +++ b/jsonid.py @@ -1,4 +1,4 @@ -"""Donovan entry-points.""" +"""JSONID entry-points.""" from src.jsonid import jsonid From 44f9a0b7394c414f651449dc7939f6df869840f0 Mon Sep 17 00:00:00 2001 From: ross-spencer Date: Sun, 4 Jan 2026 17:18:40 +0100 Subject: [PATCH 3/7] Add standalone PRONOM export Enables the user to supply a single patterns file to a PRONOM utility to output a PRONOM signature file. Additionally deepcopy has been used in the export script to ensure immutability of list parameters. --- json2pronom.py | 12 +++ pronom_example/patterns_example.json | 4 + src/jsonid/export.py | 2 +- src/jsonid/pronom.py | 34 +++++-- src/utils/jsonid2pronom.py | 133 +++++++++++++++++++++++++++ 5 files changed, 176 insertions(+), 9 deletions(-) create mode 100644 json2pronom.py create mode 100644 pronom_example/patterns_example.json create mode 100644 src/utils/jsonid2pronom.py diff --git a/json2pronom.py b/json2pronom.py new file mode 100644 index 0000000..0f2c3f7 --- /dev/null +++ b/json2pronom.py @@ -0,0 +1,12 @@ +"""JSONID entry-points.""" + +from src.utils import jsonid2pronom + + +def main(): + """Primary entry point for this script.""" + jsonid2pronom.main() + + +if __name__ == "__main__": + main() diff --git a/pronom_example/patterns_example.json b/pronom_example/patterns_example.json new file mode 100644 index 0000000..8631d38 --- /dev/null +++ b/pronom_example/patterns_example.json @@ -0,0 +1,4 @@ +[ + {"KEY": "key1", "EXISTS": null}, + {"KEY": "key2", "EXISTS": null} +] diff --git a/src/jsonid/export.py b/src/jsonid/export.py index b128b5a..7164a0c 100644 --- a/src/jsonid/export.py +++ b/src/jsonid/export.py @@ -107,7 +107,7 @@ def export_pronom() -> None: mime = "" try: sequences = pronom.process_markers( - markers.copy(), + copy.deepcopy(markers), increment_id, encoding=encoding, ) diff --git a/src/jsonid/pronom.py b/src/jsonid/pronom.py index ddf0d80..5a279df 100644 --- a/src/jsonid/pronom.py +++ b/src/jsonid/pronom.py @@ -248,13 +248,8 @@ def create_file_format_collection(fmt: list[Format]): return ff.strip() -def process_formats_and_save(formats: list[Format], filename: str): - """Process the collected formats and output a signature file. - - NB. Given our dataclasses here, we have the opportunity to rework - this data into many new structures. We output XML because DROID - expects XML. 
-    """
+def _process_formats(formats: list[Format]):
+    """Process formats into PRONOM signature-file XML."""
     isc = []
     ffc = []
     for fmt in formats:
         ffc.append(create_file_format_collection(fmt))
         if fmt.internal_signatures:
             isc.append(create_many_to_one_byte_sequence(fmt.internal_signatures))
@@ -278,14 +273,37 @@
         dom = xml.dom.minidom.parseString(signature_file)
     except xml.parsers.expat.ExpatError as err:
         logger.error("cannot process xml: %s", err)
-        return
+        return ""
     pretty_xml = dom.toprettyxml(indent=" ", encoding="utf-8")
     prettier_xml = export_helpers.new_prettify(pretty_xml)
+    return prettier_xml
+
+
+def process_formats_and_save(formats: list[Format], filename: str):
+    """Process the collected formats and output a signature file.
+
+    NB. Given our dataclasses here, we have the opportunity to rework
+    this data into many new structures. We output XML because DROID
+    expects XML.
+    """
+    prettier_xml = _process_formats(formats)
     logger.info("outputting to: %s", filename)
     with open(filename, "w", encoding="utf-8") as output_file:
         output_file.write(prettier_xml)
 
 
+def process_formats_to_stdout(formats: list[Format]):
+    """Process the collected formats and print the signature file to
+    stdout.
+
+    NB. Given our dataclasses here, we have the opportunity to rework
+    this data into many new structures. We output XML because DROID
+    expects XML.
+    """
+    prettier_xml = _process_formats(formats)
+    logger.info("outputting to: stdout")
+    print(prettier_xml)
+
+
 def encode_roundtrip(hexed_val: str, encoding: str) -> str:
     """We want to get a plain-text byte-sequence into a new
     encoding. It takes a few hops and skips.
diff --git a/src/utils/jsonid2pronom.py b/src/utils/jsonid2pronom.py
new file mode 100644
index 0000000..be3fae2
--- /dev/null
+++ b/src/utils/jsonid2pronom.py
@@ -0,0 +1,133 @@
+"""jsonid2pronom provides a helper script to enable export of generic
+JSONID compatible markers to a PRONOM compatible signature file.
+"""
+
+import argparse
+import asyncio
+import copy
+import json
+import logging
+import sys
+
+from src.jsonid import pronom
+
+# Set up logging.
+logging.basicConfig(
+    format="%(asctime)-15s %(levelname)s :: %(filename)s:%(lineno)s:%(funcName)s() :: %(message)s",  # noqa: E501
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level="INFO",
+    handlers=[
+        logging.StreamHandler(),
+    ],
+)
+
+logger = logging.getLogger(__name__)
+
+
+async def load_patterns(path: str) -> list:
+    """Load patterns from a file for conversion to a signature file."""
+    patterns = []
+    with open(path, "r", encoding="utf-8") as patterns_file:
+        patterns = json.loads(patterns_file.read())
+    return patterns
+
+
+async def output_signature(path: str):
+    """Output JSONID compatible signatures to PRONOM."""
+
+    formats = []
+
+    encodings = ("UTF-8", "UTF-16", "UTF-16BE", "UTF-32LE")
+    priorities = []
+
+    increment_id = 0
+
+    markers = await load_patterns(path)
+
+    if not markers:
+        logger.error("no patterns provided via path arg")
+        sys.exit(1)
+
+    for encoding in encodings:
+        increment_id += 1
+        json_puid = "jsonid2pronom/1"
+        name_ = f"JSONID2PRONOM Conversion ({encoding})"
+        mime = "application/json"
+        try:
+            sequences = pronom.process_markers(
+                copy.deepcopy(markers),
+                increment_id,
+                encoding=encoding,
+            )
+        except pronom.UnprocessableEntity as err:
+            logger.error(
+                "%s %s: cannot handle: %s",
+                json_puid,
+                name_,
+                err,
+            )
+            for err_marker in markers:
+                logger.debug("--- START ---")
+                logger.debug("marker: %s", err_marker)
+                logger.debug("---  END  ---")
+            continue
+        fmt = pronom.Format(
+            id=increment_id,
+            name=name_,
+            version="",
+            puid=json_puid,
+            mime=mime,
+            classification="structured text",
+            external_signatures=[
+                pronom.ExternalSignature(
+                    id=increment_id,
+                    signature="json",
+                    type=pronom.EXT,
+                )
+            ],
+            internal_signatures=sequences,
+            priorities=list(set(priorities)),
+        )
+        priorities.append(f"{increment_id}")
+        formats.append(fmt)
+
+    pronom.process_formats_to_stdout(formats)
+
+
+def main() -> None:
+    """Primary entry point for this script."""
+    parser = argparse.ArgumentParser(
+        prog="jsonid2pronom",
+        description="convert JSONID compatible markers to PRONOM",
+        epilog="for more information visit https://github.com/ffdev-info/jsonid",
+    )
+    parser.add_argument(
+        "--debug",
+        help="use debug logging",
+        required=False,
+        action="store_true",
+    )
+    parser.add_argument(
+        "--path",
+        "-p",
+        help="file path to process",
+        required=False,
+    )
+    args = parser.parse_args()
+    logging.getLogger(__name__).setLevel(logging.DEBUG if args.debug else logging.INFO)
+    logger.debug("debug logging is configured")
+    if not args.path:
+        parser.print_help(sys.stderr)
+        sys.exit()
+    asyncio.run(
+        output_signature(
+            path=args.path,
+        )
+    )
+
+
+if __name__ == "__main__":
+    main()

From b064c3e888d261878f8063690baaa9e4e3532fff Mon Sep 17 00:00:00 2001
From: ross-spencer
Date: Sun, 4 Jan 2026 17:26:21 +0100
Subject: [PATCH 4/7] Update docs

---
 docs/jsonid/export.html         |  145 ++++-
 docs/jsonid/export_helpers.html |  118 ++++
 docs/jsonid/helpers.html        |   14 +-
 docs/jsonid/index.html          |   10 +
 docs/jsonid/jsonid.html         |   11 +-
 docs/jsonid/pronom.html         | 1040 +++++++++++++++++++++++++++++++
 docs/registry/index.htm         |    8 +-
 docs/utils/index.html           |    6 +
 docs/utils/json2json.html       |    4 +-
 docs/utils/jsonid2pronom.html   |  211 +++++++
 10 files changed, 1549 insertions(+), 18 deletions(-)
 create mode 100644 docs/jsonid/export_helpers.html
 create mode 100644 docs/jsonid/pronom.html
 create mode 100644 docs/utils/jsonid2pronom.html

diff --git a/docs/jsonid/export.html b/docs/jsonid/export.html
index c85574f..52b34c9 100644
--- a/docs/jsonid/export.html
+++
b/docs/jsonid/export.html @@ -55,7 +55,7 @@

Functions

def exportJSON() -> None:  # pylint: disable=C0103
     """Export to JSON."""
-    logger.debug("exporting registry ad JSON")
+    logger.debug("exporting registry as JSON")
     data = registry_data.registry()
     json_obj = []
     id_ = {
@@ -74,9 +74,144 @@ 

Functions

Export to JSON.

+
+def export_pronom() ‑> None +
+
+
+ +Expand source code + +
def export_pronom() -> None:
+    """Export a PRONOM compatible set of signatures.
+
+    Export is done in two phases. First, a set of proposed "Baseline"
+    JSON signatures is exported to catch many JSON instances.
+
+    Second, the JSONID registry is exported.
+
+    Every export has priority over those before it so that there
+    should be no multiple identification results.
+    """
+
+    # pylint: disable=R0914; too-many local variables.
+
+    logger.debug("exporting registry as PRONOM")
+
+    reg_data = registry_data.registry()
+    formats = []
+
+    encodings = ("UTF-8", "UTF-16", "UTF-16BE", "UTF-32LE")
+    priorities = []
+
+    increment_id = 0
+
+    for encoding in encodings:
+        all_baseline = pronom.create_baseline_json_sequences(encoding)
+        for baseline in all_baseline:
+            increment_id += 1
+            fmt = pronom.Format(
+                id=increment_id,
+                name=f"JSON (Baseline - fmt/817) ({encoding})",
+                version="",
+                puid="jsonid:0000",
+                mime="application/json",
+                classification="structured text",
+                external_signatures=[
+                    pronom.ExternalSignature(
+                        id=increment_id,
+                        signature="json",
+                        type=pronom.EXT,
+                    )
+                ],
+                internal_signatures=[baseline],
+                priorities=priorities,
+            )
+            priorities.append(f"{increment_id}")
+            formats.append(fmt)
+
+    for encoding in encodings:
+        for entry in reg_data:
+            increment_id += 1
+            json_puid = f"{entry.json()['identifier']};{encoding}"
+            name_ = f"{entry.json()['name'][0]['@en']} ({encoding})"
+            markers = entry.json()["markers"]
+            try:
+                mime = entry.json()["mime"][0]
+            except IndexError:
+                mime = ""
+            try:
+                sequences = pronom.process_markers(
+                    copy.deepcopy(markers),
+                    increment_id,
+                    encoding=encoding,
+                )
+            except pronom.UnprocessableEntity as err:
+                logger.error(
+                    "%s %s: cannot handle: %s",
+                    json_puid,
+                    name_,
+                    err,
+                )
+                for marker in markers:
+                    logger.debug("--- START ---")
+                    logger.debug("marker: %s", marker)
+                    logger.debug("---  END  ---")
+                continue
+            fmt = pronom.Format(
+                id=increment_id,
+                name=name_,
+                version="",
+                puid=json_puid,
+                mime=mime,
+                classification="structured text",
+                external_signatures=[
+                    pronom.ExternalSignature(
+                        id=increment_id,
+                        signature="json",
+                        type=pronom.EXT,
+                    )
+                ],
+                internal_signatures=sequences,
+                priorities=copy.deepcopy(list(set(priorities))),
+            )
+            priorities.append(f"{increment_id}")
+            formats.append(fmt)
+
+    pronom.process_formats_and_save(formats, PRONOM_FILENAME)
+
+

Export a PRONOM compatible set of signatures.

+

Export is done in two phases. First, a set of proposed "Baseline" +JSON signatures is exported to catch many JSON instances.

+

Second, the JSONID registry is exported.

+

Every export has priority over those before it so that there +should be no multiple identification results.

+
+

Classes

+
+
+class PRONOMException +(*args, **kwargs) +
+
+
+ +Expand source code + +
class PRONOMException(Exception):
+    """Exception class if we can't create a PRONOM signature as expected."""
+
+

Exception class if we can't create a PRONOM signature as expected.

+

Ancestors

+
    +
  • builtins.Exception
  • +
  • builtins.BaseException
  • +
+
+
diff --git a/docs/jsonid/index.html b/docs/jsonid/index.html index 3a3f96e..02e4e2d 100644 --- a/docs/jsonid/index.html +++ b/docs/jsonid/index.html @@ -56,6 +56,10 @@

Sub-modules

Functions to support export.

+
src.jsonid.export_helpers
+
+

Helpers for the export functions.

+
src.jsonid.file_processing

File processing functions.

@@ -80,6 +84,10 @@

Sub-modules

Functions for output of results.

+
src.jsonid.pronom
+
+

PRONOM export routines …

+
src.jsonid.registry

JSON registry processor.

@@ -125,12 +133,14 @@

Sub-modules

  • src.jsonid.base_obj_presets
  • src.jsonid.compressionlib
  • src.jsonid.export
  • +
  • src.jsonid.export_helpers
  • src.jsonid.file_processing
  • src.jsonid.helpers
  • src.jsonid.htm_template
  • src.jsonid.jsonid
  • src.jsonid.lookup
  • src.jsonid.output
  • +
  • src.jsonid.pronom
  • src.jsonid.registry
  • src.jsonid.registry_class
  • src.jsonid.registry_data
  • diff --git a/docs/jsonid/jsonid.html b/docs/jsonid/jsonid.html index 401ff40..f217201 100644 --- a/docs/jsonid/jsonid.html +++ b/docs/jsonid/jsonid.html @@ -84,7 +84,7 @@

    Functions

    # pylint: disable=R0912,R0915 parser = argparse.ArgumentParser( - prog="json-id", + prog="jsonid", description="proof-of-concept identifier for JSON objects on disk based on identifying valid objects and their key-values", epilog="for more information visit https://github.com/ffdev-info/json-id", ) @@ -142,10 +142,12 @@

    Functions

    help="path to a custom registry to lead into memory replacing the default", required=False, ) + # NB. consider output to stdout once the feature is more stable. parser.add_argument( "--pronom", - help="return a PRONOM-centric view of the results", + help=f"return a PRONOM-centric view of the results to `{export.PRONOM_FILENAME}` (BETA)", required=False, + action="store_true", ) parser.add_argument( "--export", @@ -155,7 +157,7 @@

    Functions

    ) parser.add_argument( "--check", - help="check the registry entrues are correct", + help="check the registry entries are correct", required=False, action="store_true", ) @@ -206,7 +208,8 @@

    Functions

    if args.registry: raise NotImplementedError("custom registry is not yet available") if args.pronom: - raise NotImplementedError("pronom view is not yet implemented") + export.export_pronom() + sys.exit() if args.language: raise NotImplementedError("multiple languages are not yet implemented") if args.export: diff --git a/docs/jsonid/pronom.html b/docs/jsonid/pronom.html new file mode 100644 index 0000000..77e8db0 --- /dev/null +++ b/docs/jsonid/pronom.html @@ -0,0 +1,1040 @@ + + + + + + +src.jsonid.pronom API documentation + + + + + + + + + + + +
Module src.jsonid.pronom

PRONOM export routines.

XML tooling: https://xmllint.com/

Functions
def calculate_variable_off_bof(item: ByteSequence):
    """Given variable offsets, calculate the correct syntax."""
    seq = item.value
    if (
        item.min_off != ""
        and int(item.min_off) > 0
        and item.max_off != ""
        and int(item.max_off) > 0
    ):
        seq = f"{{{item.min_off}-{int(item.min_off)+int(item.max_off)}}}{seq}"
    elif item.max_off != "" and int(item.max_off) > 0:
        seq = f"{{0-{item.max_off}}}{seq}"
    elif item.min_off != "" and int(item.min_off) > 0:
        seq = f"{{{item.min_off}}}{seq}"
    return seq
def calculate_variable_off_eof(item: ByteSequence):
    """Given variable offsets, calculate the correct syntax."""
    seq = item.value
    if (
        item.min_off != ""
        and int(item.min_off) > 0
        and item.max_off != ""
        and int(item.max_off) > 0
    ):
        seq = f"{seq}{{{item.min_off}-{int(item.min_off)+int(item.max_off)}}}"
    elif item.max_off != "" and int(item.max_off) > 0:
        seq = f"{seq}{{0-{item.max_off}}}"
    elif item.min_off != "" and int(item.min_off) > 0:
        seq = f"{seq}{{{item.min_off}}}"
    return seq
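To make the offset syntax concrete, a minimal sketch of both helpers; the values here are illustrative only, and `ByteSequence` is the dataclass documented later on this page:

```python
# Illustrative values only; ByteSequence is the dataclass defined below.
bof = ByteSequence(id=1, pos="BOF", min_off="0", max_off="1024", endian="", value="7B")
print(calculate_variable_off_bof(bof))  # -> "{0-1024}7B": wildcard range before the hex

eof = ByteSequence(id=1, pos="EOF", min_off="0", max_off="1024", endian="", value="7D")
print(calculate_variable_off_eof(eof))  # -> "7D{0-1024}": wildcard range after the hex
```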
def convert_marker_to_signature_sequence(marker: dict, encoding: str) -> str:
    """Convert a JSONID marker into a signature sequence."""

    # pylint: disable=R0914; too-many local variables.
    # pylint: disable=R0911; too-many return statements.
    # pylint: disable=R0915; too-many statements.

    logger.debug("marker: %s", marker)

    colon_encoded: Final[str] = encode_roundtrip(COLON, encoding)
    double_quote_encoded: Final[str] = encode_roundtrip(DOUBLE_QUOTE, encoding)
    curly_open_encoded: Final[str] = encode_roundtrip(CURLY_OPEN, encoding)
    curly_close_encoded: Final[str] = encode_roundtrip(CURLY_CLOSE, encoding)
    square_open_encoded: Final[str] = encode_roundtrip(SQUARE_OPEN, encoding)
    square_close_encoded: Final[str] = encode_roundtrip(SQUARE_CLOSE, encoding)

    instruction = ""
    if registry_matchers.MARKER_GOTO in marker.keys():
        # GOTO KEY and match KEY.
        goto_key = _str_to_hex_str(marker["GOTO"])
        key_at_goto = _str_to_hex_str(marker["KEY"])
        goto_encoded = quote_and_encode(goto_key, encoding)
        key_encoded = quote_and_encode(key_at_goto, encoding)
        instruction = f"{goto_encoded}{WHITESPACE_REGEX}{colon_encoded}*{WHITESPACE_REGEX}{key_encoded}{WHITESPACE_REGEX}{colon_encoded}"
        marker.pop("GOTO")
        marker.pop("KEY")
        return instruction.upper()
    if registry_matchers.MARKER_INDEX in marker.keys():
        key = _str_to_hex_str(marker["KEY"])
        instruction = f"{WHITESPACE_REGEX}{square_open_encoded}*{curly_open_encoded}*{double_quote_encoded}{encode_roundtrip(key, encoding)}{double_quote_encoded}{WHITESPACE_REGEX}{colon_encoded}*{curly_close_encoded}*{square_close_encoded}"
        marker.pop("INDEX")
        marker.pop("KEY")
        return instruction.upper()
    if "KEY" in marker.keys():
        key = _str_to_hex_str(marker["KEY"])
        instruction = quote_and_encode(key, encoding)
        marker.pop("KEY")
    if registry_matchers.MARKER_KEY_EXISTS in marker.keys():
        instruction = f"{instruction}{WHITESPACE_REGEX}{colon_encoded}".upper()
        return instruction
    if registry_matchers.MARKER_IS_TYPE in marker.keys():
        is_type = _type_to_str(marker["ISTYPE"], encoding=encoding)
        type_val = (
            f"{instruction}{WHITESPACE_REGEX}{colon_encoded}{WHITESPACE_REGEX}{is_type}"
        )
        return type_val.upper()
    if registry_matchers.MARKER_IS in marker.keys():
        marker_is = marker["IS"]
        if not isinstance(marker_is, str):
            _complex_is_type(marker_is)
        equals = _str_to_hex_str(marker_is)
        is_val = f"{instruction}{WHITESPACE_REGEX}{encode_roundtrip(equals, encoding)}"
        return is_val.upper()
    if registry_matchers.MARKER_STARTSWITH in marker.keys():
        starts_with = _str_to_hex_str(marker["STARTSWITH"])
        starts_with_val = f"{instruction}{WHITESPACE_REGEX}{colon_encoded}{WHITESPACE_REGEX}{double_quote_encoded}{encode_roundtrip(starts_with, encoding)}"
        return starts_with_val.upper()
    if registry_matchers.MARKER_ENDSWITH in marker.keys():
        ends_with = _str_to_hex_str(marker["ENDSWITH"])
        ends_with_val = f"{instruction}{WHITESPACE_REGEX}{colon_encoded}{WHITESPACE_REGEX}*{encode_roundtrip(ends_with, encoding)}{double_quote_encoded}"
        return ends_with_val.upper()
    if registry_matchers.MARKER_CONTAINS in marker.keys():
        contains = _str_to_hex_str(marker["CONTAINS"])
        contains_val = f"{instruction}{WHITESPACE_REGEX}{colon_encoded}{WHITESPACE_REGEX}{double_quote_encoded}*{encode_roundtrip(contains, encoding)}*{double_quote_encoded}"
        return contains_val.upper()
    if registry_matchers.MARKER_REGEX in marker.keys():
        raise UnprocessableEntity("REGEX not yet implemented")
    if registry_matchers.MARKER_KEY_NO_EXIST in marker.keys():
        raise UnprocessableEntity("KEY NO EXIST not yet implemented")
    # We should never arrive here. In the future clean this up so we
    # only return when we have information.
    return ""
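A sketch of what a single marker converts to; the marker is hypothetical, and it assumes `_str_to_hex_str` hex-encodes plain strings and that `WHITESPACE_REGEX` is the module's whitespace wildcard:

```python
# Hypothetical marker: assert a "type" key exists. "type" hex-encodes to
# 74797065, so under UTF-8 the quoted key is 227479706522, followed by the
# whitespace wildcard and the encoded colon (3A).
seq = convert_marker_to_signature_sequence({"KEY": "type", "EXISTS": None}, "UTF-8")
print(seq)  # e.g. 227479706522<whitespace wildcard>3A
```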
def create_baseline_json_sequences(encoding: str):
    """Create baseline JSON sequences that match map and list types
    with various different encodings.
    """

    # pylint: disable=R0914; too-many local variables.

    curly_open_encoded: Final[str] = encode_roundtrip(CURLY_OPEN, encoding)
    curly_close_encoded: Final[str] = encode_roundtrip(CURLY_CLOSE, encoding)
    square_open_encoded: Final[str] = encode_roundtrip(SQUARE_OPEN, encoding)
    square_close_encoded: Final[str] = encode_roundtrip(SQUARE_CLOSE, encoding)

    colon_encoded: Final[str] = encode_roundtrip(COLON, encoding)
    double_quote_encoded: Final[str] = encode_roundtrip(DOUBLE_QUOTE, encoding)

    bof = f"({curly_open_encoded}|{square_open_encoded})"
    eof = f"({curly_close_encoded}|{square_close_encoded})"

    no_encoded: Final[str] = NUMBER_REGEX
    true_encoded: Final[str] = encode_roundtrip(TRUE_VALUE, encoding)
    false_encoded: Final[str] = encode_roundtrip(FALSE_VALUE, encoding)
    null_encoded: Final[str] = encode_roundtrip(NULL_VALUE, encoding)

    options = (
        f"{double_quote_encoded}{WHITESPACE_REGEX}{colon_encoded}",
        no_encoded,
        f"({true_encoded}|{false_encoded})",
        null_encoded,
    )

    sigs = []

    for opt in options:
        bs = []
        bs.append(
            ByteSequence(
                id=1,
                pos="BOF",
                min_off=0,
                max_off=f"{DISK_SECTOR_SIZE}",
                endian="",
                value=bof,
            )
        )
        bs.append(
            ByteSequence(
                id=1,
                pos="VAR",
                min_off=0,
                max_off=0,
                endian="",
                value=opt,
            )
        )
        bs.append(
            ByteSequence(
                id=1,
                pos="EOF",
                min_off="0",
                max_off=f"{DISK_SECTOR_SIZE}",
                endian="",
                value=eof,
            )
        )
        iss = InternalSignature(
            id=0,
            name="",
            byte_sequences=bs,
        )
        sigs.append(iss)

    return sigs
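The four baseline signatures per encoding can be inspected directly; a small sketch (output shown for UTF-8, where `{` is 7B, `[` is 5B, `}` is 7D and `]` is 5D):

```python
# Each baseline signature brackets one JSON value shape between an opening
# and closing brace/bracket within the first/last disk sector.
for sig in create_baseline_json_sequences("UTF-8"):
    bof, var, eof = sig.byte_sequences
    print(bof.value, var.value, eof.value)
# (7B|5B) <key/number/boolean/null sequence> (7D|5D)
```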
def create_file_format_collection(fmt: Format):
    """Create the FileFormatCollection object.

    E.g.
    ```
        <FileFormat ID="1" Name="Development Signature" PUID="dev/1" Version="1.0" MIMEType="application/octet-stream">
            <InternalSignatureID>1</InternalSignatureID>
            <Extension>ext</Extension>
        </FileFormat>

        <FileFormat ID="49" MIMEType="application/postscript" FormatType="Text (Structured)"
            Name="Adobe Illustrator" PUID="x-fmt/20" Version="1.0 / 1.1">
            <InternalSignatureID>880</InternalSignatureID>
            <InternalSignatureID>881</InternalSignatureID>
            <Extension>ai</Extension>
            <HasPriorityOverFileFormatID>86</HasPriorityOverFileFormatID>
            <HasPriorityOverFileFormatID>331</HasPriorityOverFileFormatID>
            <HasPriorityOverFileFormatID>332</HasPriorityOverFileFormatID>
            <HasPriorityOverFileFormatID>771</HasPriorityOverFileFormatID>
            <HasPriorityOverFileFormatID>773</HasPriorityOverFileFormatID>
        </FileFormat>
    ```
    """
    internal_sigs = [
        f"<InternalSignatureID>{sig.id}</InternalSignatureID>"
        for sig in fmt.internal_signatures
    ]
    external_sigs = [
        f"<Extension>{sig.signature}</Extension>"
        for sig in fmt.external_signatures
        if sig.type.lower() == EXT
    ]
    priorities = [
        f"<HasPriorityOverFileFormatID>{priority}</HasPriorityOverFileFormatID>"
        for priority in fmt.priorities
    ]
    ff = f"""
<FileFormat ID=\"{fmt.id}\" Name=\"{fmt.name}\" PUID=\"{fmt.puid}\" Version="{fmt.version}" MIMEType=\"{fmt.mime}\" FormatType=\"{fmt.classification}\" >
    {"".join(internal_sigs).strip()}
    {"".join(external_sigs).strip()}
    {"".join(priorities).strip()}
</FileFormat>
    """
    return ff.strip()
def create_many_to_one_byte_sequence(internal_signatures: list[InternalSignature]):
    """Create a many to one byte sequence, i.e. a format with multiple
    Internal Signatures.
    """
    internal_signature = ""
    for internal in internal_signatures:
        id_ = internal.id
        bs = create_one_to_many_byte_sequence(internal.byte_sequences)
        internal_signature = f"""
{internal_signature}<InternalSignature ID=\"{id_}\" Specificity=\"Specific\">
    {bs}
</InternalSignature>
        """
    return internal_signature.strip()
def create_one_to_many_byte_sequence(byte_sequences: list[ByteSequence]):
    """Create a byte sequence object."""
    byte_sequence = ""
    for item in byte_sequences:
        seq = item.value
        if item.pos.startswith("EOF"):
            seq = calculate_variable_off_eof(item)
        elif item.pos.startswith("BOF"):
            seq = calculate_variable_off_bof(item)
        byte_sequence = f"""
{byte_sequence.strip()}
    <ByteSequence Reference=\"{item.pos}\" Sequence=\"{seq}\" MinOffset=\"{item.min_off}\" MaxOffset=\"{item.max_off}\"/>
        """
    return byte_sequence.strip()
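A sketch of the XML the sequence builders emit; the offsets here are illustrative:

```python
bs = [
    ByteSequence(id=0, pos="BOF", min_off="0", max_off="1024", endian="", value="7B"),
    ByteSequence(id=0, pos="EOF", min_off="0", max_off="1024", endian="", value="7D"),
]
print(create_one_to_many_byte_sequence(bs))
# (whitespace trimmed)
# <ByteSequence Reference="BOF" Sequence="{0-1024}7B" MinOffset="0" MaxOffset="1024"/>
# <ByteSequence Reference="EOF" Sequence="7D{0-1024}" MinOffset="0" MaxOffset="1024"/>
```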
def encode_roundtrip(hexed_val: str, encoding: str) -> str:
    """We want to get a plain-text byte-sequence into a new
    encoding. It takes a few hops and skips.
    """
    val = hexed_val.strip()
    try:
        re_encoded = binascii.unhexlify(hexed_val).decode("utf-8").encode(encoding)
    except (binascii.Error, UnicodeDecodeError) as err:
        logger.error("cannot convert: %s len: %s ('%s')", hexed_val, len(val), err)
        return val
    hex_val = binascii.hexlify(re_encoded).decode().upper()
    for bom in _get_bom():
        if not hex_val.startswith(bom):
            continue
        return hex_val.replace(bom, "")
    return hex_val
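Worked examples of the round trip; `7B` is `{` in UTF-8. The UTF-16 result assumes `_get_bom()` lists the UTF-16/32 byte order marks, which the stripping loop implies:

```python
print(encode_roundtrip("7B", "UTF-8"))     # 7B
print(encode_roundtrip("7B", "UTF-16BE"))  # 007B
print(encode_roundtrip("7B", "UTF-16"))    # 7B00 (little-endian default, BOM stripped)
```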
def preprocess_goto_markers(markers: list) -> list:
    """Preprocess markers to remove data that is otherwise duplicated
    when converted to a PRONOM signature, e.g. GOTO."""

    out = []
    for marker in markers:
        if registry_matchers.MARKER_GOTO not in marker:
            out.append(marker)
            continue
        key = marker.pop("GOTO")
        new_marker = {"KEY": key, "EXISTS": None}
        if new_marker not in out:
            out.append(new_marker)
        out.append(marker)
    return out
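A quick sketch of the GOTO expansion with a hypothetical marker:

```python
markers = [{"GOTO": "asset", "KEY": "version"}]
print(preprocess_goto_markers(markers))
# -> [{'KEY': 'asset', 'EXISTS': None}, {'KEY': 'version'}]
```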
def process_formats_and_save(formats: list[Format], filename: str):
    """Process the collected formats and output a signature file.

    NB. Given our dataclasses here, we have the opportunity to rework
    this data into many new structures. We output XML because DROID
    expects XML.
    """
    prettier_xml = _process_formats(formats)
    logger.info("outputting to: %s", filename)
    with open(filename, "w", encoding="utf-8") as output_file:
        output_file.write(prettier_xml)
def process_formats_to_stdout(formats: list[Format]):
    """Process the collected formats and write the signature XML to
    stdout.

    NB. Given our dataclasses here, we have the opportunity to rework
    this data into many new structures. We output XML because DROID
    expects XML.
    """
    prettier_xml = _process_formats(formats)
    logger.info("outputting to: stdout")
    print(prettier_xml)
def process_markers(
    markers: list, sig_id: int, encoding: str = ""
) -> list[InternalSignature]:
    """Given a set of markers for a document type, process them into
    a set of byte sequences and finally an internal signature sequence
    that can be output as a PRONOM signature.

    Returns a list containing the internal signature derived from the
    markers. Some conversions, e.g. type matches, are potentially
    lossy. Marker shapes handled here include:

    dict_keys(['CONTAINS'])
    dict_keys(['ENDSWITH'])
    dict_keys(['IS'])
    dict_keys(['ISTYPE'])
    dict_keys(['STARTSWITH'])

    key(0-n):(0-n)value

    Need to return something like:

      <ByteSequence Reference="BOFoffset" Sequence="FFD8FFE0{2}4A464946000101(00|01|02)" MinOffset="0" MaxOffset=""/>

    Different encodings need to be accounted for, e.g. (with added
    whitespace below)

    UTF-32-LE:

        00000000: 2000 0000 2000 0000 2000 0000 2000 0000   ... ... ... ...
        00000010: 2000 0000 2000 0000 0a00 0000 0a00 0000   ... ...........
        00000020: 0a00 0000 0a00 0000 7b00 0000 2200 0000  ........{..."...
        00000030: 6100 0000 2200 0000 3a00 0000 2000 0000  a..."...:... ...
        00000040: 2200 0000 6200 0000 2200 0000 7d00 0000  "...b..."...}...
        00000050: 0a00 0000                                ....

    UTF-32-BE:

        00000000: 0000 0020 0000 0020 0000 0020 0000 0020  ... ... ... ...
        00000010: 0000 0020 0000 0020 0000 000a 0000 000a  ... ... ........
        00000020: 0000 000a 0000 000a 0000 007b 0000 0022  ...........{..."
        00000030: 0000 0061 0000 0022 0000 003a 0000 0020  ...a..."...:...
        00000040: 0000 0022 0000 0062 0000 0022 0000 007d  ..."...b..."...}
        00000050: 0000 000a                                ....

    UTF-16-LE:

        00000000: 2000 2000 2000 2000 2000 2000 0a00 0a00   . . . . . .....
        00000010: 0a00 0a00 7b00 2200 6100 2200 3a00 2000  ....{.".a.".:. .
        00000020: 2200 6200 2200 7d00 0a00                 ".b.".}...

    UTF-16-BE:

        00000000: 0020 0020 0020 0020 0020 0020 000a 000a  . . . . . . ....
        00000010: 000a 000a 007b 0022 0061 0022 003a 0020  .....{.".a.".:.
        00000020: 0022 0062 0022 007d 000a                 .".b.".}..
    """

    curly_open_encoded: Final[str] = encode_roundtrip(CURLY_OPEN, encoding)
    curly_close_encoded: Final[str] = encode_roundtrip(CURLY_CLOSE, encoding)

    sequences = []

    markers = preprocess_goto_markers(markers)

    for marker in markers:
        sig_sequence = convert_marker_to_signature_sequence(marker, encoding)
        sequences.append(sig_sequence)

    byte_sequences = []

    byte_sequences.append(
        ByteSequence(
            id=0,
            pos="BOF",
            min_off=0,
            max_off=f"{DISK_SECTOR_SIZE}",
            endian="",
            value=curly_open_encoded,
        )
    )

    for idx, item in enumerate(sequences, 0):
        logger.debug("%s. %s", idx, item)
        byte_sequence = ByteSequence(
            id=idx,
            pos="VAR",
            min_off="",
            max_off="",
            endian="",
            value=item,
        )
        byte_sequences.append(byte_sequence)

    byte_sequences.append(
        ByteSequence(
            id=0,
            pos="EOF",
            min_off="0",
            max_off=f"{DISK_SECTOR_SIZE}",
            endian="",
            value=curly_close_encoded,
        )
    )

    internal_signature = InternalSignature(
        id=sig_id,
        name="",
        byte_sequences=byte_sequences,
    )

    return [internal_signature]
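End to end, one hypothetical marker produces a single internal signature bracketed by brace sequences (UTF-8 values shown; the VAR value depends on the marker type):

```python
sigs = process_markers([{"KEY": "version", "EXISTS": None}], sig_id=1, encoding="UTF-8")
for bs in sigs[0].byte_sequences:
    print(bs.pos, bs.value)
# BOF 7B
# VAR <quoted "version" key plus the encoded colon>
# EOF 7D
```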
def quote_and_encode(value, encoding) -> str:
    """Quote and encode a given value."""

    double_quote_encoded: Final[str] = encode_roundtrip(DOUBLE_QUOTE, encoding)
    return f"{double_quote_encoded}{encode_roundtrip(value, encoding)}{double_quote_encoded}"
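For example, for the hex string `76657273696f6e` ("version") under UTF-8 this returns the value wrapped in encoded double quotes (22):

```python
print(quote_and_encode("76657273696f6e", "UTF-8"))
# -> 2276657273696F6E22
```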
Classes

@dataclass
class ByteSequence:
    id: str
    pos: str
    min_off: str
    max_off: str
    endian: str
    value: str


@dataclass
class ExternalSignature:
    id: str
    signature: str
    type: str


@dataclass
class Format:  # pylint: disable=R0902
    id: str
    name: str
    version: str
    puid: str
    mime: str
    classification: str
    external_signatures: list[ExternalSignature]
    internal_signatures: list[InternalSignature]
    priorities: list[int]


@dataclass
class Identifier:
    type: str
    value: str


@dataclass
class InternalSignature:
    id: str
    name: str
    byte_sequences: list[ByteSequence]


@dataclass
class Priority:
    type: str
    id: str


class UnprocessableEntity(Exception):
    """Provide a way to give complete feedback to the caller to allow
    it to exit."""
diff --git a/docs/registry/index.htm b/docs/registry/index.htm
index 7c333b4..672769c 100644
--- a/docs/registry/index.htm
+++ b/docs/registry/index.htm
@@ -672,13 +672,13 @@

 Contents

-jrid:0020 - GL Transmission Format: GLTF runtime 3D asset library (Generic)
+jrid:0020 - GL Transmission Format: GLTF runtime 3D asset library schema (Generic)
 False
 False
 False
 True
-{'KEY': '$schema', 'STARTSWITH': 'http://json-schema.org/'}
-{'KEY': '$schema', 'ENDSWITH': '/schema#'}
+{'KEY': '$schema', 'STARTSWITH': 'https://json-schema.org/'}
+{'KEY': '$schema', 'ENDSWITH': '/schema'}
 {'KEY': 'title', 'EXISTS': None}
 {'KEY': 'type', 'IS': 'object'}
 {'KEY': 'description', 'IS': 'The root object for a glTF asset.'}

@@ -1510,7 +1510,7 @@

 Contents

   • jrid:0017: Open Resume Document
   • jrid:0018: jacker song
   • jrid:0019: JSON Patch
-  • jrid:0020: GL Transmission Format: GLTF runtime 3D asset library (Generic)
+  • jrid:0020: GL Transmission Format: GLTF runtime 3D asset library schema (Generic)
   • jrid:0021: Tweet Object
   • jrid:0022: sandboxels save file
   • jrid:0023: dublin core metadata (archivematica)

diff --git a/docs/utils/index.html b/docs/utils/index.html
index ce33717..6236290 100644
--- a/docs/utils/index.html
+++ b/docs/utils/index.html
@@ -45,6 +45,11 @@

 Sub-modules

 json2json will convert JSON compatible objects from one encoding to UTF-8.

+src.utils.jsonid2pronom
+
+jsonid2pronom provides a helper script to enable export of generic
+JSONID compatible markers to a PRONOM compatible signature file.
+
@@ -67,6 +72,7 @@

 Sub-modules

   • src.utils.json2json
+  • src.utils.jsonid2pronom

diff --git a/docs/utils/json2json.html b/docs/utils/json2json.html
index d68fa96..62dce2d 100644
--- a/docs/utils/json2json.html
+++ b/docs/utils/json2json.html
@@ -78,7 +78,7 @@

 async def identify_json(paths: list[str]):
     """Identify objects."""
     for idx, path in enumerate(paths):
-        valid, data, _, _ = await identify_plaintext_bytestream(path)
+        valid, data = await identify_plaintext_bytestream(path)
         if not valid:
             continue
         print(json.dumps(data, indent=2))

@@ -143,7 +143,7 @@
     parser = argparse.ArgumentParser(
         prog="json2json",
         description="parse JSON UTF-16 (BE-LE) objects and output them as UTF-8 for the sake of developer ergonomics",
-        epilog="for more information visit https://github.com/ffdev-info/json-id",
+        epilog="for more information visit https://github.com/ffdev-info/jsonid",
     )
     parser.add_argument(
         "--debug",

diff --git a/docs/utils/jsonid2pronom.html b/docs/utils/jsonid2pronom.html
new file mode 100644
index 0000000..09d9b27
--- /dev/null
+++ b/docs/utils/jsonid2pronom.html
@@ -0,0 +1,211 @@

Module src.utils.jsonid2pronom

jsonid2pronom provides a helper script to enable export of generic
JSONID compatible markers to a PRONOM compatible signature file.

Functions
async def load_patterns(path: str) -> list:
    """Load patterns from a file for conversion to a signature file."""
    patterns = []
    with open(path, "r", encoding="utf-8") as patterns_file:
        patterns = json.loads(patterns_file.read())
    return patterns
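A patterns file is a JSON list of marker objects; a minimal, hypothetical round trip:

```python
import asyncio
import json

# Hypothetical patterns file; markers follow the JSONID registry syntax.
with open("patterns_example.json", "w", encoding="utf-8") as file:
    json.dump([{"KEY": "version", "EXISTS": None}], file)

print(asyncio.run(load_patterns("patterns_example.json")))
# -> [{'KEY': 'version', 'EXISTS': None}]
```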
def main() -> None:
    """Primary entry point for this script."""
    parser = argparse.ArgumentParser(
        prog="jsonid2pronom",
        description="convert JSONID compatible markers to PRONOM",
        epilog="for more information visit https://github.com/ffdev-info/jsonid",
    )
    parser.add_argument(
        "--debug",
        help="use debug logging",
        required=False,
        action="store_true",
    )
    parser.add_argument(
        "--path",
        "-p",
        help="file path to process",
        required=False,
    )
    args = parser.parse_args()
    logging.getLogger(__name__).setLevel(logging.DEBUG if args.debug else logging.INFO)
    logger.debug("debug logging is configured")
    if not args.path:
        parser.print_help(sys.stderr)
        sys.exit()
    asyncio.run(
        output_signature(
            path=args.path,
        )
    )
async def output_signature(path: str):
    """Output JSONID compatible signatures to PRONOM."""

    formats = []

    encodings = ("UTF-8", "UTF-16", "UTF-16BE", "UTF-32LE")
    priorities = []

    increment_id = 0

    markers = await load_patterns(path)

    if not markers:
        logger.error("no patterns provided via path arg")
        sys.exit(1)

    for encoding in encodings:
        increment_id += 1
        json_puid = "jsonid2pronom/1"
        name_ = f"JSONID2PRONOM Conversion ({encoding})"
        try:
            mime = "application/json"
        except IndexError:
            mime = ""
        try:
            sequences = pronom.process_markers(
                copy.deepcopy(markers),
                increment_id,
                encoding=encoding,
            )
        except pronom.UnprocessableEntity as err:
            logger.error(
                "%s %s: cannot handle: %s",
                json_puid,
                name_,
                err,
            )
            for err_marker in markers:
                logger.debug("--- START ---")
                logger.debug("marker: %s", err_marker)
                logger.debug("---  END  ---")
            continue
        fmt = pronom.Format(
            id=increment_id,
            name=name_,
            version="",
            puid=json_puid,
            mime=mime,
            classification="structured text",
            external_signatures=[
                pronom.ExternalSignature(
                    id=increment_id,
                    signature="json",
                    type=pronom.EXT,
                )
            ],
            internal_signatures=sequences,
            priorities=list(set(priorities)),
        )
        priorities.append(f"{increment_id}")
        formats.append(fmt)

    pronom.process_formats_to_stdout(formats)
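The CLI's `--path` flag wraps this coroutine; called programmatically from the package context it is simply (path hypothetical):

```python
import asyncio

asyncio.run(output_signature(path="patterns_example.json"))  # signature XML on stdout
```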

From 9bb5ef5be2eb8cfee24d90c6412b08afca95a0e6 Mon Sep 17 00:00:00 2001
From: ross-spencer
Date: Sun, 4 Jan 2026 17:34:22 +0100
Subject: [PATCH 5/7] Fix imports and provide PRONOM entry point

---
 pyproject.toml             | 1 +
 src/utils/jsonid2pronom.py | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index b7fe1b5..991bcdf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,7 @@ dependencies = {file = ["requirements/requirements.txt"]}
 jsonid = "jsonid.jsonid:main"
 momoa = "jsonid.jsonid:main"
 json2json = "utils.json2json:main"
+json2pronom = "utils.jsonid2pronom:main"

 [build-system]
 requires = ["setuptools>=80.9.0", "wheel>=0.45.1", "setuptools_scm[toml]>=9.2.2"]

diff --git a/src/utils/jsonid2pronom.py b/src/utils/jsonid2pronom.py
index be3fae2..a515f28 100644
--- a/src/utils/jsonid2pronom.py
+++ b/src/utils/jsonid2pronom.py
@@ -9,7 +9,13 @@
 import logging
 import sys

-from src.jsonid import pronom
+try:
+    from src.jsonid import pronom
+except ModuleNotFoundError:
+    try:
+        from jsonid import pronom
+    except ModuleNotFoundError:
+        import pronom

 # Set up logging.
 logging.basicConfig(

From fa97890a4fce756ee16261c481166fc7183a2f6e Mon Sep 17 00:00:00 2001
From: ross-spencer
Date: Sun, 4 Jan 2026 17:43:35 +0100
Subject: [PATCH 6/7] Add PRONOM export to the README

---
 README.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/README.md b/README.md
index 2a17e78..51a5669 100644
--- a/README.md
+++ b/README.md
@@ -413,6 +413,60 @@
 PRONOM IDs that can then be referenced in the JSONID output. Eventually,
 PRONOM or a PRONOM-like tool might host an authoritative version of the
 JSONID registry.

+### JSONID for PRONOM Signature Development
+
+JSONID provides a high-level language for output of PRONOM compatible
+signatures. The feature set is still in its BETA phase, but JSONID provides
+two distinct capabilities:
+
+#### 1. Registry output
+
+JSONID's registry can be output using the `--pronom` flag. A signature file
+will be created under `jsonid_pronom.xml` which can be imported into DROID
+for identification of document types registered with JSONID.
+
+JSONID's registry is output alongside a handful of baseline JSON signatures
+designed to capture "plain" JSON that is not yet encoded in the registry.
+
+#### 2. Signature development
+
+A standalone `json2pronom` utility is provided for creation of potentially
+robust DROID compatible signatures.
+
+As a high-level language, signatures can be defined in an easy to understand
+syntax and then output consistently via the `json2pronom` utility. Signatures
+include sensible defaults for whitespace and other aspects that are
+difficult for signature developers to consistently anticipate when writing
+JSON based signatures.
+
+Given a [sample pattern file](./pronom_example/patterns_example.json) a DROID
+compatible snippet can be output as follows (UTF-8 shown for brevity;
+sequence values elided):
+
+```xml
+<InternalSignature ID="1" Specificity="Specific">
+    <ByteSequence Reference="BOF" Sequence="..." MinOffset="0" MaxOffset="..."/>
+    <ByteSequence Reference="VAR" Sequence="..." MinOffset="" MaxOffset=""/>
+    <ByteSequence Reference="EOF" Sequence="..." MinOffset="0" MaxOffset="..."/>
+</InternalSignature>
+
+<FileFormat ID="1" Name="JSONID2PRONOM Conversion (UTF-8)" PUID="jsonid2pronom/1" Version="" MIMEType="application/json" FormatType="structured text" >
+    <InternalSignatureID>1</InternalSignatureID>
+    <Extension>json</Extension>
+</FileFormat>
+```
+
+Feedback on this utility is welcome.
+
 ## Output format

 Previously JSONID output YAML containing all result object metadata. It has
From 66b482d897837017cc2b7551357e2b6a65b7e82d Mon Sep 17 00:00:00 2001
From: ross-spencer
Date: Sun, 4 Jan 2026 17:57:05 +0100
Subject: [PATCH 7/7] Fix PUID numbering

---
 src/utils/jsonid2pronom.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/jsonid2pronom.py b/src/utils/jsonid2pronom.py
index a515f28..444b641 100644
--- a/src/utils/jsonid2pronom.py
+++ b/src/utils/jsonid2pronom.py
@@ -56,7 +56,7 @@ async def output_signature(path: str):

     for encoding in encodings:
         increment_id += 1
-        json_puid = "jsonid2pronom/1"
+        json_puid = f"jsonid2pronom/{increment_id}"
         name_ = f"JSONID2PRONOM Conversion ({encoding})"
         try:
             mime = "application/json"