From 7675bd5f5fe046a4b22a61ca4e8011ec914fdb31 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Tue, 20 Jan 2026 19:45:12 +0100 Subject: [PATCH 1/2] Add a variable-length integer encoder/decoder --- crates/edit/benches/lib.rs | 68 +++++++++++++++++ crates/stdext/src/lib.rs | 1 + crates/stdext/src/varint.rs | 142 ++++++++++++++++++++++++++++++++++++ 3 files changed, 211 insertions(+) create mode 100644 crates/stdext/src/varint.rs diff --git a/crates/edit/benches/lib.rs b/crates/edit/benches/lib.rs index 4c8fcc37df3..41ee4f4d7c8 100644 --- a/crates/edit/benches/lib.rs +++ b/crates/edit/benches/lib.rs @@ -227,6 +227,73 @@ fn bench_unicode(c: &mut Criterion) { }); } +fn bench_varint(c: &mut Criterion) { + const BUFFER_SIZE: usize = MEBI; + + let mut buffer = Vec::with_capacity(BUFFER_SIZE + 16); + + // Knuth's MMIX LCG + let mut rng_state = 1442695040888963407u64; + let mut rng = || { + rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); + rng_state as u32 + }; + + // Bitmask with Rejection (as used by Apple) + let mut rng_state = 1442695040888963407u64; + let mut rng_range = |range: Range| { + let range_size = range.len() as u32; + let mask = range_size.next_power_of_two() - 1; + loop { + rng_state = + rng_state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); + let value = rng_state as u32 & mask; + if value < range_size { + return range.start.wrapping_add(value); + } + } + }; + + loop { + // Generate values according to a non-uniform distribution. + // The distribution roughly corresponds to what LSH encounters. 
+ let value = match rng() { + // ~35%: <=7 bits + 0..1503238553 => rng_range(0..0x7F), + // ~40%: <=14 bits + 1503238553..3221225472 => rng_range(0x80..0x3FFF), + // ~20%: <=21 bits + 3221225472..4026531840 => rng_range(0x4000..0x1FFFFF), + // ~5%: largest 28-bit value, not u32::MAX (TODO: confirm which was intended) + _ => (1 << 28) - 1, + }; + + buffer.extend(varint::encode(value)); + + if buffer.len() > BUFFER_SIZE { + break; + } + } + + // As per the varint::decode() safety requirements, we need 4 readable bytes at every offset (3 bytes of padding). + // We pre-allocated `buffer` with extra capacity, so we technically fulfill that. + // _Technically_, however, we also make Rust unhappy, because it's uninitialized memory. + // It's just that I really really don't care about any such antics. It's memory. + + c.benchmark_group("varint").bench_function("decode", |b| { + let mut off = 0; + + b.iter(|| { + let (val, len) = unsafe { varint::decode(buffer.as_ptr().add(off)) }; + black_box(val); + off += len; + if off >= buffer.len() { + off = 0; + } + }); + }); +} + fn bench(c: &mut Criterion) { arena::init(128 * MEBI).unwrap(); @@ -238,6 +305,7 @@ fn bench(c: &mut Criterion) { bench_simd_memset::(c); bench_simd_memset::(c); bench_unicode(c); + bench_varint(c); } criterion_group!(benches, bench); diff --git a/crates/stdext/src/lib.rs b/crates/stdext/src/lib.rs index d7226d7e19c..7e8a849e15b 100644 --- a/crates/stdext/src/lib.rs +++ b/crates/stdext/src/lib.rs @@ -7,6 +7,7 @@ pub mod arena; pub mod sys; +pub mod varint; mod helpers; pub use helpers::*; diff --git a/crates/stdext/src/varint.rs b/crates/stdext/src/varint.rs new file mode 100644 index 00000000000..64a40c037a6 --- /dev/null +++ b/crates/stdext/src/varint.rs @@ -0,0 +1,142 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Variable-length `u32` encoding and decoding, with efficient storage of `u32::MAX`. +//! `u32::MAX` is a common value in Microsoft Edit's syntax highlighter bytecode. +//! +//! # Format +//! +//! ```text +//! 0-127 ( 7 bits): xxxxxxx0 +//! 
128-16383 (14 bits): xxxxxx01 yyyyyyyx +//! 16384-2097151 (21 bits): xxxxx011 yyyyyyxx zzzzzzzy +//! 2097152-268435455 (28 bits): xxxx0111 yyyyyxxx zzzzzzyy wwwwwwwz +//! 4294967295 (32 bits): ....1111 +//! ``` +//! +//! The least significant bits indicate the length as a run of 1s, analogous to a UTF-8 lead byte. The remaining bits store +//! the value, in little-endian order. Little endian was chosen, as most architectures today use that. +//! +//! On x86, `tzcnt` (= `trailing_ones()` = what we need) has the benefit that its encoding is identical to `rep bsf`. +//! Older CPUs without BMI1 will ignore the `rep` prefix and use `bsf`, while modern CPUs will use the faster `tzcnt`. +//! So not just can we drop the need for `bswap` on x86, but we also speed up the bit count calculation. +//! This makes this encoding faster than LEB128, Google Varint, and others. + +pub fn encode(val: u32) -> Vec { + let mut result = Vec::with_capacity(5); + let shift = match val { + 0..0x80 => 0, + 0x80..0x4000 => 1, + 0x4000..0x200000 => 2, + 0x200000..0x10000000 => 3, + _ => { + result.push(0xff); + return result; + } + }; + let marker = (1u32 << shift) - 1; + let encoded = (val << (shift + 1)) | marker; + let bytes = encoded.to_le_bytes(); + result.extend_from_slice(&bytes[..=shift]); + result +} + +/// # Safety +/// +/// The caller must ensure that `data..data+4` is valid memory. +/// It doesn't need to be a valid value, but it must be readable. +pub unsafe fn decode(data: *const u8) -> (u32, usize) { + // For inputs such as: + // [0xff, 0xff, 0xff, 0xff] + // the shifts below will shift by more than 31 bits, which Rust treats as overflow (panic if checked, masked otherwise). + // *We explicitly rely on the unchecked (masked) behavior here*. + // + // If we write an if condition here (like this one), LLVM will turn that into a proper branch. Since our inputs + // are relatively random, that branch will mispredict, hurting performance. 
The if condition at the end + // gets turned into conditional moves (good!), but that only works because it comes after the shifts. + // Unfortunately, there's no way to ask Rust for "platform-defined behavior" (`unchecked_shl/shr` is not it). + #[cfg(debug_assertions)] + unsafe { + if (*data & 0x0f) == 0x0f { + return (u32::MAX, 1); + } + } + + unsafe { + // Read the following 4 bytes in a single u32 load. We need to swap to big-endian to move the lead + // 0/10/110/1110/1111 bits to the MSB. This then allows us to do a single, quick `leading_ones` call. + let val = u32::from_le((data as *const u32).read_unaligned()); + let ones = val.trailing_ones(); + + let mut len = ones as usize + 1; + let mut res = 'bextr: { + // Give LLVM a helping hand for x86 CPUs with BMI1. It's not smart enough to figure out that `bextr` can + // be used here. To be fair, it's not faster, so maybe that's why. It is _a lot_ more compact, however. + #[cfg(target_feature = "bmi1")] + break 'bextr std::arch::x86_64::_bextr_u32(val, len as u32, (7 * len) as u32); + + // This is where you'd put more architecture-specific optimizations. + // In fact this is where I'd put my ARM optimizations, but it doesn't have anything like `bextr`. :( + + let mut res = val; + // Shift out the bytes we read but don't need. + res <<= 32 - 8 * len; + // Shift back down and remove the trailing 0/10/110/1110/1111 length bits. + res >>= 32 - 7 * len; + break 'bextr res; + }; + + // If the lead byte indicates >28 bits, assume `u32::MAX`. + // This doubles as a simple form of error correction. 
+ if len > 4 { + res = u32::MAX; + len = 1; + } + + (res, len) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_encode_decode_roundtrip() { + // Test various boundary values + let test_values = [ + 0u32, + 1, + 123, + 127, // Max 1 byte + 128, // Min 2 bytes + 1234, + 16383, // Max 2 bytes + 16384, // Min 3 bytes + 2097151, // Max 3 bytes + 2097152, // Min 4 bytes + 268435455, // Max 4 bytes + u32::MAX, // Special case + ]; + + for &val in &test_values { + let encoded = encode(val); + println!("Value {} encoded as: {:02X?}", val, encoded); + let (decoded, len) = unsafe { decode(encoded.as_ptr()) }; + println!(" Decoded as: {} with length {}", decoded, len); + assert_eq!(decoded, val, "Failed roundtrip for value {}", val); + assert_eq!(len, encoded.len(), "Length mismatch for value {}", val); + } + } + + #[test] + fn test_specific_encodings() { + // Test specific byte patterns + unsafe { + assert_eq!((0, 1), decode([0, 0xbb, 0xcc, 0xdd].as_ptr())); + assert_eq!((123, 1), decode([0xf6, 0xbb, 0xcc, 0xdd].as_ptr())); + assert_eq!((1234, 2), decode([0x49, 0x13, 0xcc, 0xdd].as_ptr())); + assert_eq!((u32::MAX, 1), decode([0xff, 0xbb, 0xcc, 0xdd].as_ptr())); + } + } +} From 8198b206c686973d43f193f58aac3e555d686cc2 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Tue, 20 Jan 2026 21:52:29 +0100 Subject: [PATCH 2/2] Fix build --- crates/edit/benches/lib.rs | 3 ++- crates/stdext/src/varint.rs | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/edit/benches/lib.rs b/crates/edit/benches/lib.rs index 41ee4f4d7c8..8f22b9edd23 100644 --- a/crates/edit/benches/lib.rs +++ b/crates/edit/benches/lib.rs @@ -3,6 +3,7 @@ use std::hint::black_box; use std::io::Cursor; +use std::ops::Range; use std::{mem, vec}; use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; @@ -10,7 +11,7 @@ use edit::helpers::*; use edit::simd::MemsetSafe; use edit::{buffer, hash, oklab, simd, unicode}; use serde::Deserialize; -use 
stdext::arena; +use stdext::{arena, varint}; #[derive(Deserialize)] pub struct EditingTracePatch(pub usize, pub usize, pub String); diff --git a/crates/stdext/src/varint.rs b/crates/stdext/src/varint.rs index 64a40c037a6..5b1e867a457 100644 --- a/crates/stdext/src/varint.rs +++ b/crates/stdext/src/varint.rs @@ -63,8 +63,6 @@ pub unsafe fn decode(data: *const u8) -> (u32, usize) { } unsafe { - // Read the following 4 bytes in a single u32 load. We need to swap to big-endian to move the lead - // 0/10/110/1110/1111 bits to the MSB. This then allows us to do a single, quick `leading_ones` call. let val = u32::from_le((data as *const u32).read_unaligned()); let ones = val.trailing_ones();