diff --git a/binaryfusefilter.go b/binaryfusefilter.go index f1f0f99..69e0096 100644 --- a/binaryfusefilter.go +++ b/binaryfusefilter.go @@ -1,12 +1,79 @@ package xorfilter import ( + "encoding/binary" "errors" "math" "math/bits" "unsafe" ) +// isBigEndian returns true if the host CPU uses big-endian byte order. +func isBigEndian() bool { + var x uint16 = 0x0102 + return *(*byte)(unsafe.Pointer(&x)) == 0x01 +} + +// forceLEUint16 converts a uint16 slice to little-endian byte order in place. +// On little-endian systems, this is a no-op. +func forceLEUint16(data []uint16) { + if !isBigEndian() { + return + } + for i := range data { + // Swap bytes: convert from big-endian to little-endian + data[i] = (data[i] << 8) | (data[i] >> 8) + } +} + +// forceLEUint32 converts a uint32 slice to little-endian byte order in place. +// On little-endian systems, this is a no-op. +func forceLEUint32(data []uint32) { + if !isBigEndian() { + return + } + for i := range data { + data[i] = binary.LittleEndian.Uint32((*[4]byte)(unsafe.Pointer(&data[i]))[:]) + } +} + +// forceLE converts a slice of unsigned integers to little-endian byte order in place. +// On little-endian systems, this is a no-op. For uint8, this is always a no-op. +func forceLE[T Unsigned](data []T) { + if !isBigEndian() { + return + } + var zero T + switch any(zero).(type) { + case uint8: + // No conversion needed for single bytes + case uint16: + forceLEUint16(*(*[]uint16)(unsafe.Pointer(&data))) + case uint32: + forceLEUint32(*(*[]uint32)(unsafe.Pointer(&data))) + } +} + +// fromLE converts a single value from little-endian storage to native byte order. +// On little-endian systems, this is a no-op. For uint8, this is always a no-op. +func fromLE[T Unsigned](v T) T { + if !isBigEndian() { + return v + } + var zero T + switch any(zero).(type) { + case uint8: + return v + case uint16: + u := uint16(v) + return T((u << 8) | (u >> 8)) + case uint32: + u := uint32(v) + return T(binary.LittleEndian.Uint32((*[4]byte)(unsafe.Pointer(&u))[:])) + } + return v +} + type Unsigned interface { ~uint8 | ~uint16 | ~uint32 } @@ -19,6 +86,12 @@ type BinaryFuse[T Unsigned] struct { SegmentCountLength uint32 Fingerprints []T + + // Portable, when true, ensures that Fingerprints are stored in little-endian + // byte order regardless of the host CPU's native endianness. This allows + // filters to be serialized and shared across different architectures. + // Only affects uint16 and uint32 fingerprint types; uint8 is unaffected. + Portable bool } // NewBinaryFuse creates a binary fuse filter with provided keys. For best @@ -36,6 +109,18 @@ func NewBinaryFuse[T Unsigned](keys []uint64) (*BinaryFuse[T], error) { return &filter, nil } +// NewBinaryFusePortable creates a binary fuse filter with Portable=true, +// ensuring fingerprints are stored in little-endian byte order for +// cross-platform compatibility. See NewBinaryFuse for more details. +func NewBinaryFusePortable[T Unsigned](keys []uint64) (*BinaryFuse[T], error) { + var b BinaryFuseBuilder + filter, err := BuildBinaryFusePortable[T](&b, keys) + if err != nil { + return nil, err + } + return &filter, nil +} + // BinaryFuseBuilder can be used to reuse memory allocations across multiple // BinaryFuse builds. type BinaryFuseBuilder struct { @@ -248,6 +333,19 @@ func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFus return filter, nil } +// BuildBinaryFusePortable creates a binary fuse filter with Portable=true, +// ensuring fingerprints are stored in little-endian byte order for +// cross-platform compatibility. See BuildBinaryFuse for more details. +func BuildBinaryFusePortable[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFuse[T], error) { + filter, err := BuildBinaryFuse[T](b, keys) + if err != nil { + return filter, err + } + filter.Portable = true + forceLE(filter.Fingerprints) + return filter, nil +} + func (filter *BinaryFuse[T]) initializeParameters(b *BinaryFuseBuilder, size uint32) { arity := uint32(3) filter.SegmentLength = calculateSegmentLength(arity, size) @@ -296,7 +394,11 @@ func (filter *BinaryFuse[T]) Contains(key uint64) bool { hash := mixsplit(key, filter.Seed) f := T(fingerprint(hash)) h0, h1, h2 := filter.getHashFromHash(hash) - f ^= filter.Fingerprints[h0] ^ filter.Fingerprints[h1] ^ filter.Fingerprints[h2] + if filter.Portable { + f ^= fromLE(filter.Fingerprints[h0]) ^ fromLE(filter.Fingerprints[h1]) ^ fromLE(filter.Fingerprints[h2]) + } else { + f ^= filter.Fingerprints[h0] ^ filter.Fingerprints[h1] ^ filter.Fingerprints[h2] + } return f == 0 } diff --git a/binaryfusefilter8.go b/binaryfusefilter8.go index ee8584a..7615998 100644 --- a/binaryfusefilter8.go +++ b/binaryfusefilter8.go @@ -14,6 +14,19 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) { return (*BinaryFuse8)(filter), nil } +// PopulateBinaryFuse8Portable fills the filter with provided keys and sets +// Portable=true for cross-platform compatibility. For best results, the caller +// should avoid having too many duplicated keys. +// The function may return an error if the set is empty. +func PopulateBinaryFuse8Portable(keys []uint64) (*BinaryFuse8, error) { + filter, err := NewBinaryFusePortable[uint8](keys) + if err != nil { + return nil, err + } + + return (*BinaryFuse8)(filter), nil +} + // Contains returns `true` if key is part of the set with a false positive probability of <0.4%. func (filter *BinaryFuse8) Contains(key uint64) bool { return (*BinaryFuse[uint8])(filter).Contains(key) diff --git a/binaryfusefilter_test.go b/binaryfusefilter_test.go index ff7d1fd..01692e7 100644 --- a/binaryfusefilter_test.go +++ b/binaryfusefilter_test.go @@ -377,3 +377,176 @@ func crossCheckFuseBuilder[T Unsigned](t *testing.T, bld *BinaryFuseBuilder, key _ = expected require.Equal(t, *expected, filter) } + +func TestBinaryFusePortableBasic(t *testing.T) { + keys := make([]uint64, NUM_KEYS) + for i := range keys { + keys[i] = rand.Uint64() + } + filter, err := NewBinaryFusePortable[testType](keys) + require.NoError(t, err) + assert.True(t, filter.Portable) + + for _, v := range keys { + assert.True(t, filter.Contains(v), "key %d should be in filter", v) + } + + // Test false positive rate + falsesize := 1000000 + matches := 0 + for i := 0; i < falsesize; i++ { + v := rand.Uint64() + if filter.Contains(v) { + matches++ + } + } + fpp := float64(matches) * 100.0 / float64(falsesize) + assert.Less(t, fpp, 1.0, "false positive rate should be less than 1%%") +} + +func TestBinaryFusePortable_AllTypes(t *testing.T) { + keys := make([]uint64, 10000) + for i := range keys { + keys[i] = rand.Uint64() + } + + t.Run("uint8", func(t *testing.T) { + filter, err := NewBinaryFusePortable[uint8](keys) + require.NoError(t, err) + assert.True(t, filter.Portable) + for _, v := range keys { + assert.True(t, filter.Contains(v)) + } + }) + + t.Run("uint16", func(t *testing.T) { + filter, err := NewBinaryFusePortable[uint16](keys) + require.NoError(t, err) + assert.True(t, filter.Portable) + for _, v := range keys { + assert.True(t, filter.Contains(v)) + } + }) + + t.Run("uint32", func(t *testing.T) { + filter, err := NewBinaryFusePortable[uint32](keys) + require.NoError(t, err) + assert.True(t, filter.Portable) + for _, v := range keys { + assert.True(t, filter.Contains(v)) + } + }) +} + +func TestBinaryFusePortableBuilder(t *testing.T) { + var bld BinaryFuseBuilder + for i := 0; i < 50; i++ { + n := 1 + rand.IntN(1<