From 3fbbd82c9e745a3482d2edf09fa11629faefa4d0 Mon Sep 17 00:00:00 2001 From: Martin Sigloch Date: Wed, 30 Jul 2025 08:45:42 +0000 Subject: [PATCH 1/5] Add ULEB128 and SLEB128 encoding for Nat and Int values --- src/lib.mo | 182 +++++++++++++++++++++++++++++++++++++++- tests/ByteUtils.Test.mo | 12 +++ tests/Sorted.Test.mo | 6 -- 3 files changed, 191 insertions(+), 9 deletions(-) diff --git a/src/lib.mo b/src/lib.mo index d78e4ba..1e6b938 100644 --- a/src/lib.mo +++ b/src/lib.mo @@ -2,7 +2,6 @@ import Prim "mo:prim"; import B "mo:base/Buffer"; import Iter "mo:base/Iter"; -import Blob "mo:base/Blob"; import Array "mo:base/Array"; import Nat8 "mo:base/Nat8"; import Nat16 "mo:base/Nat16"; @@ -467,6 +466,18 @@ module ByteUtils { Buffer.readLEB128_64(buffer); }; + public func toLEB128(n : Nat) : [Nat8] { + let buffer = B.Buffer(10); + Buffer.addLEB128_nat(buffer, n); + B.toArray(buffer); + }; + + public func fromLEB128(bytes : Bytes) : Nat { + let buffer = B.Buffer(10); + for (byte in bytes) { buffer.add(byte) }; + Buffer.readLEB128_nat(buffer); + }; + public func toSLEB128_64(n : Int64) : [Nat8] { let buffer = B.Buffer(10); Buffer.addSLEB128_64(buffer, n); @@ -479,6 +490,18 @@ module ByteUtils { Buffer.readSLEB128_64(buffer); }; + public func toSLEB128(n : Int) : [Nat8] { + let buffer = B.Buffer(10); + Buffer.addSLEB128_int(buffer, n); + B.toArray(buffer); + }; + + public func fromSLEB128(bytes : Bytes) : Int { + let buffer = B.Buffer(10); + for (byte in bytes) { buffer.add(byte) }; + Buffer.readSLEB128_int(buffer); + }; + public module Buffer { public func addBytes(buffer : BufferLike, iter : Iter.Iter) { @@ -992,6 +1015,7 @@ module ByteUtils { // https://en.wikipedia.org/wiki/LEB128 // limited to 64-bit unsigned integers // more performant than the general unsigned_leb128 + /// Add ULEB128 encoded number to the end of a buffer public func addLEB128_64(buffer : BufferLike, n : Nat64) { var value = n; while (value >= 0x80) { @@ -1002,7 +1026,8 @@ module ByteUtils { }; - // Write LEB128 at a specific offset + /// Write ULEB128 encoded value at a specific offset. + /// Traps if the buffer is smaller than the offset and number of encoded bytes. public func writeLEB128_64(buffer : BufferLike, offset : Nat, n : Nat64) { var n64 : Nat64 = n; var index = offset; @@ -1019,9 +1044,36 @@ module ByteUtils { }; + /// Add ULEB128 encoded Nat to the end of the buffer. + public func addLEB128_nat(buffer : BufferLike, n : Nat) { + var value = n; + while (value >= 0x80) { + buffer.add(Nat8.fromNat(value % 0x80) + 0x80); + value /= 0x80; + }; + buffer.add(Nat8.fromNat(value)); + + }; + + /// Write ULEB128 encoded value at a sepcific offset. + /// Traps if the buffer is smaller than the offset and number of encoded bytes. + public func writeLEB128_nat(buffer : BufferLike, offset : Nat, n : Nat) { + var value = n; + var index = offset; + + while (value >= 0x80) { + buffer.put(index, Nat8.fromNat(value % 0x80) + 0x80); + index += 1; + value /= 0x80; + }; + buffer.put(index, Nat8.fromNat(value)); + + }; + // https://en.wikipedia.org/wiki/LEB128 // limited to 64-bit signed integers // more performant than the general signed_leb128 + /// Add SLEB128 encoded value to the end of a buffer. public func addSLEB128_64(buffer : BufferLike, _n : Int64) { let n = Int64.toInt(_n); let is_negative = n < 0; @@ -1062,7 +1114,8 @@ module ByteUtils { }; }; - // Write SLEB128 at a specific offset + /// Write SLEB128 encoded value at a specific offset. + /// Traps if the buffer is smaller than the offset and number of encoded bytes. public func writeSLEB128_64(buffer : BufferLike, offset : Nat, _n : Int64) { let n = Int64.toInt(_n); let is_negative = n < 0; @@ -1106,7 +1159,78 @@ module ByteUtils { }; + /// Add SLEB128 encoded value to the end of a buffer. + public func addSLEB128_int(buffer : BufferLike, n : Int) { + var value = n; + let is_negative = value < 0; + + // Convert to correct absolute value representation first + var more = true; + + while (more) { + // Get lowest 7 bits + var byte : Nat8 = Nat8.fromIntWrap(value) & 0x7F; + + // Shift for next iteration + if (is_negative) { + value := (value - 127) / 128; // -127 to round down instead of towards 0 + } else { + value /= 128; + }; + + // Determine if we need more bytes + if ( + (value == 0 and (byte & 0x40) == 0) or + (value == -1 and (byte & 0x40) != 0) + ) { + more := false; + } else { + byte |= 0x80; // Set continuation bit + }; + + buffer.add(byte); + }; + }; + + /// Write SLEB128 encoded value at a specific offset. + /// Traps if the buffer is smaller than the offset and number of encoded bytes. + public func writeSLEB128_int(buffer : BufferLike, offset : Nat, n : Int) { + var value = n; + let is_negative = value < 0; + var index = offset; + + // Convert to correct absolute value representation first + var more = true; + + while (more) { + // Get lowest 7 bits + var byte : Nat8 = Nat8.fromIntWrap(value) & 0x7F; + + // Shift for next iteration + if (is_negative) { + value := (value - 127) / 128; // -127 to round down instead of towards 0 + } else { + value /= 128; + }; + + // Determine if we need more bytes + if ( + (value == 0 and (byte & 0x40) == 0) or + (value == -1 and (byte & 0x40) != 0) + ) { + more := false; + } else { + byte |= 0x80; // Set continuation bit + }; + + buffer.put(index, byte); + index += 1; + }; + }; + // https://en.wikipedia.org/wiki/LEB128 + /// Read unsigned LEB128 value from buffer + /// Traps if end of buffer is reached before value is completely decoded public func readLEB128_64(buffer : BufferLike) : Nat64 { var n64 : Nat64 = 0; var shift : Nat64 = 0; @@ -1126,6 +1250,29 @@ module ByteUtils { n64; }; + /// Read unsigned LEB128 value from buffer + /// Traps if end of buffer is reached before value is completely decoded + public func readLEB128_nat(buffer : BufferLike) : Nat { + var n : Nat = 0; + var shift : Nat = 1; + var i = 0; + + label decoding_leb loop { + let byte = buffer.get(i); + i += 1; + + n += (Nat8.toNat(byte & 0x7f)) * shift; + + if (byte & 0x80 == 0) break decoding_leb; + shift *= 128; + + }; + + n; + }; + + /// Read signed LEB128 value from buffer + /// Traps if end of buffer is reached before value is completely decoded public func readSLEB128_64(buffer : BufferLike) : Int64 { var result : Nat64 = 0; var shift : Nat64 = 0; @@ -1155,6 +1302,35 @@ module ByteUtils { Int64.fromNat64(result); }; + public func readSLEB128_int(buffer : BufferLike) : Int { + var result : Int = 0; + var shift : Int = 1; + var byte : Nat8 = 0; + var i = 0; + + label analyzing loop { + byte := buffer.get(i); + i += 1; + + // Add this byte's 7 bits to the result + result += Nat8.toNat(byte & 0x7F) * shift; + shift *= 128; + + // If continuation bit is not set, we're done reading bytes + if ((byte & 0x80) == 0) { + break analyzing; + }; + }; + + // Sign extend if this is a negative number + if (byte & 0x40 != 0) { + // Fill the rest with 1s (sign extension) + result -= shift; + }; + + result; + }; + }; }; diff --git a/tests/ByteUtils.Test.mo b/tests/ByteUtils.Test.mo index 06335d2..6922f42 100644 --- a/tests/ByteUtils.Test.mo +++ b/tests/ByteUtils.Test.mo @@ -261,6 +261,12 @@ suite( let decoded = ByteUtils.fromLEB128_64(encoded.vals()); assert decoded == value; + + let encodedNat = ByteUtils.toLEB128(Nat64.toNat(value)); + assert encodedNat == expectedBytes; + + let decodedNat = ByteUtils.fromLEB128(encoded.vals()); + assert decodedNat == Nat64.toNat(value); }; }, ); @@ -299,6 +305,12 @@ suite( let decoded = ByteUtils.fromSLEB128_64(encoded.vals()); assert decoded == value; + + let encodedInt = ByteUtils.toSLEB128(Int64.toInt(value)); + assert encodedInt == expectedBytes; + + let decodedInt = ByteUtils.fromSLEB128(encoded.vals()); + assert decodedInt == Int64.toInt(value); }; }, ); diff --git a/tests/Sorted.Test.mo b/tests/Sorted.Test.mo index 5218637..23abf62 100644 --- a/tests/Sorted.Test.mo +++ b/tests/Sorted.Test.mo @@ -1,9 +1,7 @@ // @testmode wasi -import Debug "mo:base/Debug"; import Buffer "mo:base/Buffer"; import Blob "mo:base/Blob"; import Text "mo:base/Text"; -import Char "mo:base/Char"; import Nat "mo:base/Nat"; import Nat8 "mo:base/Nat8"; import Int8 "mo:base/Int8"; @@ -15,14 +13,10 @@ import Float "mo:base/Float"; import Nat64 "mo:base/Nat64"; import Nat16 "mo:base/Nat16"; import Nat32 "mo:base/Nat32"; -import Int "mo:base/Int"; import Bool "mo:base/Bool"; -import Order "mo:base/Order"; -import Array "mo:base/Array"; import { test; suite } "mo:test"; import Itertools "mo:itertools/Iter"; -import PeekableIter "mo:itertools/PeekableIter"; import BpTree "mo:augmented-btrees/BpTree"; import Cmp "mo:augmented-btrees/Cmp"; From dc543c861ab295a74f6db8b6a6194fce6af05ead Mon Sep 17 00:00:00 2001 From: Martin Sigloch Date: Wed, 30 Jul 2025 11:24:08 +0200 Subject: [PATCH 2/5] Add keywords for discoverability --- mops.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mops.toml b/mops.toml index 41cb881..e4e7cbb 100644 --- a/mops.toml +++ b/mops.toml @@ -3,7 +3,7 @@ name = "byte-utils" version = "0.0.1" license = "MIT" description = "A collection of utilities for byte manipulation and conversion." -keywords = ["leb128", "endian"] +keywords = [ "encoding", "decoding", "conversion", "endian", "LEB128", "ULEB128", "SLEB128"] repository = "https://github.com/NatLabs/ByteUtils" [dependencies] From a7e4375c0eb130b2a21445be27cdde492db1e6fc Mon Sep 17 00:00:00 2001 From: Martin Sigloch Date: Wed, 30 Jul 2025 09:48:41 +0000 Subject: [PATCH 3/5] Add doc comments to leb128 functions --- src/lib.mo | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/lib.mo b/src/lib.mo index 1e6b938..2df8ffc 100644 --- a/src/lib.mo +++ b/src/lib.mo @@ -454,48 +454,56 @@ module ByteUtils { public let LE = LittleEndian; public let BE = BigEndian; + /// Encodes a `Nat64` into ULEB128 format. public func toLEB128_64(n64 : Nat64) : [Nat8] { let buffer = B.Buffer(10); Buffer.addLEB128_64(buffer, n64); B.toArray(buffer); }; + /// Decodes a ULEB128-encoded `Nat64` from a byte iterator. public func fromLEB128_64(bytes : Bytes) : Nat64 { let buffer = B.Buffer(10); for (byte in bytes) { buffer.add(byte) }; Buffer.readLEB128_64(buffer); }; + /// Encodes a `Nat` into ULEB128 format. public func toLEB128(n : Nat) : [Nat8] { let buffer = B.Buffer(10); Buffer.addLEB128_nat(buffer, n); B.toArray(buffer); }; + /// Decodes a ULEB128-encoded `Nat` from a byte iterator. public func fromLEB128(bytes : Bytes) : Nat { let buffer = B.Buffer(10); for (byte in bytes) { buffer.add(byte) }; Buffer.readLEB128_nat(buffer); }; + /// Encodes an `Int64` into SLEB128 format. public func toSLEB128_64(n : Int64) : [Nat8] { let buffer = B.Buffer(10); Buffer.addSLEB128_64(buffer, n); B.toArray(buffer); }; + /// Decodes an SLEB128-encoded `Int64` from a byte iterator. public func fromSLEB128_64(bytes : Bytes) : Int64 { let buffer = B.Buffer(10); for (byte in bytes) { buffer.add(byte) }; Buffer.readSLEB128_64(buffer); }; + /// Encodes an `Int` into SLEB128 format. public func toSLEB128(n : Int) : [Nat8] { let buffer = B.Buffer(10); Buffer.addSLEB128_int(buffer, n); B.toArray(buffer); }; + /// Decodes an SLEB128-encoded `Int` from a byte iterator. public func fromSLEB128(bytes : Bytes) : Int { let buffer = B.Buffer(10); for (byte in bytes) { buffer.add(byte) }; From 6d9620b2a29f887616e09586eee3616a4d4f8365 Mon Sep 17 00:00:00 2001 From: Martin Sigloch Date: Wed, 30 Jul 2025 10:10:42 +0000 Subject: [PATCH 4/5] Add note to LEB128 functions that can trap --- src/lib.mo | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/lib.mo b/src/lib.mo index 2df8ffc..09997cd 100644 --- a/src/lib.mo +++ b/src/lib.mo @@ -462,6 +462,7 @@ module ByteUtils { }; /// Decodes a ULEB128-encoded `Nat64` from a byte iterator. + /// Traps if end of buffer is reached before value is completely decoded. public func fromLEB128_64(bytes : Bytes) : Nat64 { let buffer = B.Buffer(10); for (byte in bytes) { buffer.add(byte) }; @@ -476,6 +477,7 @@ module ByteUtils { }; /// Decodes a ULEB128-encoded `Nat` from a byte iterator. + /// Traps if end of buffer is reached before value is completely decoded. public func fromLEB128(bytes : Bytes) : Nat { let buffer = B.Buffer(10); for (byte in bytes) { buffer.add(byte) }; @@ -490,6 +492,7 @@ module ByteUtils { }; /// Decodes an SLEB128-encoded `Int64` from a byte iterator. + /// Traps if end of buffer is reached before value is completely decoded. public func fromSLEB128_64(bytes : Bytes) : Int64 { let buffer = B.Buffer(10); for (byte in bytes) { buffer.add(byte) }; @@ -504,6 +507,7 @@ module ByteUtils { }; /// Decodes an SLEB128-encoded `Int` from a byte iterator. + /// Traps if end of buffer is reached before value is completely decoded. public func fromSLEB128(bytes : Bytes) : Int { let buffer = B.Buffer(10); for (byte in bytes) { buffer.add(byte) }; @@ -1063,7 +1067,7 @@ module ByteUtils { }; - /// Write ULEB128 encoded value at a sepcific offset. + /// Write ULEB128 encoded value at a specific offset. /// Traps if the buffer is smaller than the offset and number of encoded bytes. public func writeLEB128_nat(buffer : BufferLike, offset : Nat, n : Nat) { var value = n; @@ -1237,8 +1241,8 @@ module ByteUtils { }; // https://en.wikipedia.org/wiki/LEB128 - /// Read unsigned LEB128 value from buffer - /// Traps if end of buffer is reached before value is completely decoded + /// Read unsigned LEB128 value from buffer. + /// Traps if end of buffer is reached before value is completely decoded. public func readLEB128_64(buffer : BufferLike) : Nat64 { var n64 : Nat64 = 0; var shift : Nat64 = 0; @@ -1258,8 +1262,8 @@ module ByteUtils { n64; }; - /// Read unsigned LEB128 value from buffer - /// Traps if end of buffer is reached before value is completely decoded + /// Read unsigned LEB128 value from buffer. + /// Traps if end of buffer is reached before value is completely decoded. public func readLEB128_nat(buffer : BufferLike) : Nat { var n : Nat = 0; var shift : Nat = 1; @@ -1279,8 +1283,8 @@ module ByteUtils { n; }; - /// Read signed LEB128 value from buffer - /// Traps if end of buffer is reached before value is completely decoded + /// Read signed LEB128 value from buffer. + /// Traps if end of buffer is reached before value is completely decoded. public func readSLEB128_64(buffer : BufferLike) : Int64 { var result : Nat64 = 0; var shift : Nat64 = 0; @@ -1310,6 +1314,8 @@ module ByteUtils { Int64.fromNat64(result); }; + /// Read signed LEB128 value from buffer. + /// Traps if end of buffer is reached before value is completely decoded. public func readSLEB128_int(buffer : BufferLike) : Int { var result : Int = 0; var shift : Int = 1; From b68627aa4402fd0a8fb72b8f8119536cff56d0cf Mon Sep 17 00:00:00 2001 From: Martin Sigloch Date: Thu, 31 Jul 2025 16:05:09 +0000 Subject: [PATCH 5/5] Add test cases for LEB128 encoding of values larger than 2**64 --- tests/ByteUtils.Test.mo | 49 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tests/ByteUtils.Test.mo b/tests/ByteUtils.Test.mo index 6922f42..9ad11cc 100644 --- a/tests/ByteUtils.Test.mo +++ b/tests/ByteUtils.Test.mo @@ -271,6 +271,28 @@ suite( }, ); + test( + "LEB128 large values", + func() { + let testVectors : [(Nat, [Nat8])] = [ + (2 ** 64, [0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x02]), + (2 ** 65, [0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04]), + (2 ** 70, [0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01]), + (2 ** 64 + 1, [0x81, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x02]), + (123456789012345678901234567890, [0xd2, 0x95, 0xfc, 0xf1, 0xe4, 0x9d, 0xf8, 0xb9, 0xc3, 0xed, 0xbf, 0xc8, 0xee, 0x31]), + ]; + + for ((value, expectedBytes) in testVectors.vals()) { + let encoded = ByteUtils.toLEB128(value); + Debug.print(debug_show ("leb128 large", value, encoded, expectedBytes)); + assert encoded == expectedBytes; + + let decodedNat = ByteUtils.fromLEB128(encoded.vals()); + assert decodedNat == value; + }; + }, + ); + test( "SLEB128 test vectors ", func() { @@ -315,6 +337,33 @@ suite( }, ); + test( + "SLEB128 large values", + func() { + let testVectors : [(Int, [Nat8])] = [ + (2 ** 64, [0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x02]), + (2 ** 65, [0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04]), + (2 ** 70, [0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01]), + (2 ** 64 + 1, [0x81, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x02]), + (123456789012345678901234567890, [0xd2, 0x95, 0xfc, 0xf1, 0xe4, 0x9d, 0xf8, 0xb9, 0xc3, 0xed, 0xbf, 0xc8, 0xee, 0x31]), + (-1 * (2 ** 64), [0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7e]), + (-1 * (2 ** 65), [0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7c]), + (-1 * (2 ** 70), [0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7f]), + (-1 * (2 ** 64 + 1), [0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7d]), + (-123456789012345678901234567890, [0xae, 0xea, 0x83, 0x8e, 0x9b, 0xe2, 0x87, 0xc6, 0xbc, 0x92, 0xc0, 0xb7, 0x91, 0x4e]), + ]; + + for ((value, expectedBytes) in testVectors.vals()) { + let encoded = ByteUtils.toSLEB128(value); + Debug.print(debug_show ("sleb128 large", value, encoded, expectedBytes)); + assert encoded == expectedBytes; + + let decodedNat = ByteUtils.fromSLEB128(encoded.vals()); + assert decodedNat == value; + }; + }, + ); + }, );