From 161eff00fbec794438210c1c7cda7009e4259b12 Mon Sep 17 00:00:00 2001 From: Radu Berinde Date: Wed, 7 Jan 2026 09:57:29 -0800 Subject: [PATCH 1/2] Improve binary fuse parameter testing We add a test that shows the range of sizes and segment counts for each segment length. We also add a test that checks filter generation at "boundary" sizes in terms of segment lengths. The test prints the average and max number of iterations for each tested size. Output with numTrials=100: ``` size: 2 iterations: 1.02 avg (2 max) size: 8 iterations: 1.02 avg (2 max) size: 24 iterations: 1.13 avg (3 max) size: 27 iterations: 1.02 avg (2 max) size: 55 iterations: 1.02 avg (2 max) size: 91 iterations: 1.04 avg (3 max) size: 120 iterations: 1.00 avg (1 max) size: 303 iterations: 1.09 avg (3 max) size: 349 iterations: 1.04 avg (2 max) size: 1009 iterations: 1.02 avg (2 max) size: 1124 iterations: 1.13 avg (2 max) size: 3361 iterations: 1.03 avg (3 max) size: 3551 iterations: 9.45 avg (42 max) size: 11192 iterations: 1.03 avg (2 max) size: 11521 iterations: 109.79 avg (528 max) size: 37272 iterations: 1.00 avg (1 max) size: 37454 iterations: 15.42 avg (70 max) size: 124117 iterations: 1.02 avg (2 max) size: 126131 iterations: 1.70 avg (6 max) size: 413309 iterations: 1.01 avg (2 max) size: 416077 iterations: 1.83 avg (6 max) size: 1376321 iterations: 1.00 avg (1 max) ``` --- binaryfusefilter.go | 12 ++-- binaryfusefilter_test.go | 132 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 4 deletions(-) diff --git a/binaryfusefilter.go b/binaryfusefilter.go index f1f0f99..1b33f45 100644 --- a/binaryfusefilter.go +++ b/binaryfusefilter.go @@ -56,6 +56,11 @@ type BinaryFuseBuilder struct { // // The function may return an error if the set is empty. 
func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFuse[T], error) { + f, _, err := buildBinaryFuse[T](b, keys) + return f, err +} + +func buildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (_ BinaryFuse[T], iterations int, _ error) { size := uint32(len(keys)) var filter BinaryFuse[T] filter.initializeParameters(b, size) @@ -78,13 +83,12 @@ func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFus var h012 [6]uint32 // this could be used to compute the mod3 // tabmod3 := [5]uint8{0,1,2,0,1} - iterations := 0 for { iterations += 1 if iterations > MaxIterations { // The probability of this happening is lower than the cosmic-ray // probability (i.e., a cosmic ray corrupts your system). - return BinaryFuse[T]{}, errors.New("too many iterations") + return BinaryFuse[T]{}, iterations, errors.New("too many iterations") } blockBits := 1 @@ -228,7 +232,7 @@ func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFus filter.Seed = splitmix64(&rngcounter) } if size == 0 { - return filter, nil + return filter, iterations, nil } for i := int(size - 1); i >= 0; i-- { @@ -245,7 +249,7 @@ func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFus filter.Fingerprints[h012[found]] = xor2 ^ filter.Fingerprints[h012[found+1]] ^ filter.Fingerprints[h012[found+2]] } - return filter, nil + return filter, iterations, nil } func (filter *BinaryFuse[T]) initializeParameters(b *BinaryFuseBuilder, size uint32) { diff --git a/binaryfusefilter_test.go b/binaryfusefilter_test.go index ff7d1fd..a60fe3c 100644 --- a/binaryfusefilter_test.go +++ b/binaryfusefilter_test.go @@ -4,6 +4,9 @@ import ( "fmt" "math/rand/v2" "slices" + "sort" + "strings" + "sync" "testing" "github.com/cespare/xxhash/v2" @@ -377,3 +380,132 @@ func crossCheckFuseBuilder[T Unsigned](t *testing.T, bld *BinaryFuseBuilder, key _ = expected require.Equal(t, *expected, filter) } + +// segmentLengthSizes represents the 
range of sizes [startSize, endSize] that +// all get the same segmentLength. +type segmentLengthSizes struct { + segmentLength uint32 + startSize uint32 + startSegmentCount uint32 + endSize uint32 + endSegmentCount uint32 +} + +var binaryFuseParamStableOnce struct { + once sync.Once + result []segmentLengthSizes +} + +const binaryFuseParamTableMaxSegmentSize = 16384 + +func binaryFuseSegLenAndCnt(size uint32) (segLen uint32, segCnt uint32) { + var f BinaryFuse[uint8] + f.initializeParameters(&BinaryFuseBuilder{}, size) + return f.SegmentLength, f.SegmentCount +} + +func binaryFuseParamsTable() []segmentLengthSizes { + binaryFuseParamStableOnce.once.Do(func() { + var table []segmentLengthSizes + size := uint32(1) + for { + segLen, segCnt := binaryFuseSegLenAndCnt(size) + if segLen > binaryFuseParamTableMaxSegmentSize { + break + } + // Find the first size that changes the segment length. + n := uint32(sort.Search(int(size*4), func(x int) bool { + l, _ := binaryFuseSegLenAndCnt(size + uint32(x)) + return l != segLen + })) + _, endSegCnt := binaryFuseSegLenAndCnt(size + n - 1) + table = append(table, segmentLengthSizes{ + segmentLength: segLen, + startSize: size, + startSegmentCount: segCnt, + endSize: size + n - 1, + endSegmentCount: endSegCnt, + }) + size += n + } + binaryFuseParamStableOnce.result = table + }) + return binaryFuseParamStableOnce.result +} + +// TestBinaryFuseParams shows the segment count and size range for each segment +// length. Used to verify any changes in parameter calculation. 
+func TestBinaryFuseParams(t *testing.T) { + expected := ` +| SegLen | SegCnt range | Size range | +|--------|--------------|-------------------| +| 4 | 1 - 1 | 1 - 2 | +| 8 | 1 - 1 | 3 - 8 | +| 16 | 1 - 2 | 9 - 27 | +| 32 | 1 - 3 | 28 - 91 | +| 64 | 1 - 5 | 92 - 303 | +| 128 | 2 - 9 | 304 - 1009 | +| 256 | 4 - 16 | 1010 - 3361 | +| 512 | 7 - 26 | 3362 - 11192 | +| 1024 | 12 - 42 | 11193 - 37272 | +| 2048 | 20 - 69 | 37273 - 124117 | +| 4096 | 34 - 114 | 124118 - 413309 | +| 8192 | 56 - 188 | 413310 - 1376321 | +| 16384 | 93 - 313 | 1376322 - 4583149 | +` + + var out strings.Builder + fmt.Fprintf(&out, "| SegLen | SegCnt range | Size range |\n") + fmt.Fprintf(&out, "|--------|--------------|-------------------|\n") + for _, row := range binaryFuseParamsTable() { + fmt.Fprintf(&out, "| %6d | %4d - %-5d | %7d - %-7d |\n", + row.segmentLength, + row.startSegmentCount, row.endSegmentCount, + row.startSize, row.endSize, + ) + } + str := out.String() + require.Equal(t, strings.TrimSpace(expected), strings.TrimSpace(str)) +} + +func checkNumIterations(t *testing.T, size uint32) { + const numTrials = 20 + + keys := make([]uint64, size) + var totalIterations, maxIterations int + for range numTrials { + for i := range keys { + keys[i] = rand.Uint64() + } + var b BinaryFuseBuilder + filter, iterations, err := buildBinaryFuse[uint8](&b, keys) + require.NoError(t, err) + for range 100 { + require.True(t, filter.Contains(keys[rand.IntN(len(keys))])) + } + totalIterations += iterations + maxIterations = max(maxIterations, iterations) + } + t.Logf("size: %d iterations: %.2f avg (%d max)", size, float64(totalIterations)/numTrials, maxIterations) +} + +func TestBinaryFuseBoundarySizes(t *testing.T) { + // For each segment length, test the smallest and largest segment count. For a + // given segment count, we want to choose the largest size for that count + // (which has the least "slack" space). 
+ for _, s := range binaryFuseParamsTable() { + if s.startSize > 1_000_000 { + // Larger sizes take too long to test. + break + } + if s.startSegmentCount != s.endSegmentCount { + // Find the first size that doesn't use the start segment count. + n := uint32(sort.Search(int(s.endSize-s.startSize+1), func(x int) bool { + l, c := binaryFuseSegLenAndCnt(s.startSize + uint32(x)) + return l != s.segmentLength || c != s.startSegmentCount + })) + checkNumIterations(t, s.startSize+n-1) + } + checkNumIterations(t, s.endSize) + } +} From 87e37e6c5c97e931d449163bd35344f0bd74cbe2 Mon Sep 17 00:00:00 2001 From: Radu Berinde Date: Wed, 7 Jan 2026 10:32:10 -0800 Subject: [PATCH 2/2] Try smaller segment length in binary fuse build Some sizes around segment length transitions require many iterations and would work much better with the previous segment length. We add a simple fix that is more robust than tweaking the formula: once every four iterations, we try the previous segment length while keeping the same capacity. Note that in most cases this won't affect the build because it's rare to need more than 1-2 iterations. 
`TestBinaryFuseBoundarySizes` output (with numTrials=100): ``` binaryfusefilter_test.go:490: size: 2 iterations: 1.02 avg (2 max) binaryfusefilter_test.go:490: size: 8 iterations: 1.08 avg (3 max) binaryfusefilter_test.go:490: size: 24 iterations: 1.08 avg (3 max) binaryfusefilter_test.go:490: size: 27 iterations: 1.03 avg (2 max) binaryfusefilter_test.go:490: size: 55 iterations: 1.02 avg (2 max) binaryfusefilter_test.go:490: size: 91 iterations: 1.02 avg (2 max) binaryfusefilter_test.go:490: size: 120 iterations: 1.04 avg (2 max) binaryfusefilter_test.go:490: size: 303 iterations: 1.04 avg (2 max) binaryfusefilter_test.go:490: size: 349 iterations: 1.01 avg (2 max) binaryfusefilter_test.go:490: size: 1009 iterations: 1.01 avg (2 max) binaryfusefilter_test.go:490: size: 1124 iterations: 1.16 avg (4 max) binaryfusefilter_test.go:490: size: 3361 iterations: 1.03 avg (2 max) binaryfusefilter_test.go:490: size: 3551 iterations: 2.05 avg (6 max) binaryfusefilter_test.go:490: size: 11192 iterations: 1.04 avg (3 max) binaryfusefilter_test.go:490: size: 11521 iterations: 2.10 avg (6 max) binaryfusefilter_test.go:490: size: 37272 iterations: 1.01 avg (2 max) binaryfusefilter_test.go:490: size: 37454 iterations: 2.09 avg (6 max) binaryfusefilter_test.go:490: size: 124117 iterations: 1.03 avg (2 max) binaryfusefilter_test.go:490: size: 126131 iterations: 1.53 avg (4 max) binaryfusefilter_test.go:490: size: 413309 iterations: 1.00 avg (1 max) binaryfusefilter_test.go:490: size: 416077 iterations: 1.50 avg (4 max) binaryfusefilter_test.go:490: size: 1376321 iterations: 1.02 avg (3 max) ``` --- binaryfusefilter.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/binaryfusefilter.go b/binaryfusefilter.go index 1b33f45..6c4b029 100644 --- a/binaryfusefilter.go +++ b/binaryfusefilter.go @@ -90,6 +90,26 @@ func buildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (_ BinaryF // probability (i.e., a cosmic ray corrupts your system). 
return BinaryFuse[T]{}, iterations, errors.New("too many iterations") } + if size > 4 && size < 1_000_000 { + // The segment length is calculated using an empirical formula. For some + // sizes, the segment length is too large and leads to many iterations. + // Once every four iterations, use the previous segment length while + // keeping the same capacity. See TestBinaryFuseBoundarySizes. + switch iterations % 4 { + case 2: + // Switch to smaller segment size. + filter.SegmentLength /= 2 + filter.SegmentLengthMask = filter.SegmentLength - 1 + filter.SegmentCount = filter.SegmentCount*2 + 2 + filter.SegmentCountLength = filter.SegmentCount * filter.SegmentLength + case 3: + // Restore the calculated segment size. + filter.SegmentLength *= 2 + filter.SegmentLengthMask = filter.SegmentLength - 1 + filter.SegmentCount = filter.SegmentCount/2 - 1 + filter.SegmentCountLength = filter.SegmentCount * filter.SegmentLength + } + } blockBits := 1 for (1 << blockBits) < filter.SegmentCount {