diff --git a/binaryfusefilter.go b/binaryfusefilter.go index f1f0f99..6c4b029 100644 --- a/binaryfusefilter.go +++ b/binaryfusefilter.go @@ -56,6 +56,11 @@ type BinaryFuseBuilder struct { // // The function may return an error if the set is empty. func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFuse[T], error) { + f, _, err := buildBinaryFuse[T](b, keys) + return f, err +} + +func buildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (_ BinaryFuse[T], iterations int, _ error) { size := uint32(len(keys)) var filter BinaryFuse[T] filter.initializeParameters(b, size) @@ -78,13 +83,32 @@ func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFus var h012 [6]uint32 // this could be used to compute the mod3 // tabmod3 := [5]uint8{0,1,2,0,1} - iterations := 0 for { iterations += 1 if iterations > MaxIterations { // The probability of this happening is lower than the cosmic-ray // probability (i.e., a cosmic ray corrupts your system). - return BinaryFuse[T]{}, errors.New("too many iterations") + return BinaryFuse[T]{}, iterations, errors.New("too many iterations") + } + if size > 4 && size < 1_000_000 { + // The segment length is calculated using an empirical formula. For some + // sizes, the segment length is too large and leads to many iterations. + // Once every four iterations, use the previous segment length while + // keeping the same capacity. See TestBinaryFuseBoundarySizes. + switch iterations % 4 { + case 2: + // Switch to smaller segment size. + filter.SegmentLength /= 2 + filter.SegmentLengthMask = filter.SegmentLength - 1 + filter.SegmentCount = filter.SegmentCount*2 + 2 + filter.SegmentCountLength = filter.SegmentCount * filter.SegmentLength + case 3: + // Restore the calculated segment size. 
+ filter.SegmentLength *= 2 + filter.SegmentLengthMask = filter.SegmentLength - 1 + filter.SegmentCount = filter.SegmentCount/2 - 1 + filter.SegmentCountLength = filter.SegmentCount * filter.SegmentLength + } } blockBits := 1 @@ -228,7 +252,7 @@ func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFus filter.Seed = splitmix64(&rngcounter) } if size == 0 { - return filter, nil + return filter, iterations, nil } for i := int(size - 1); i >= 0; i-- { @@ -245,7 +269,7 @@ func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFus filter.Fingerprints[h012[found]] = xor2 ^ filter.Fingerprints[h012[found+1]] ^ filter.Fingerprints[h012[found+2]] } - return filter, nil + return filter, iterations, nil } func (filter *BinaryFuse[T]) initializeParameters(b *BinaryFuseBuilder, size uint32) { diff --git a/binaryfusefilter_test.go b/binaryfusefilter_test.go index ff7d1fd..a60fe3c 100644 --- a/binaryfusefilter_test.go +++ b/binaryfusefilter_test.go @@ -4,6 +4,9 @@ import ( "fmt" "math/rand/v2" "slices" + "sort" + "strings" + "sync" "testing" "github.com/cespare/xxhash/v2" @@ -377,3 +380,132 @@ func crossCheckFuseBuilder[T Unsigned](t *testing.T, bld *BinaryFuseBuilder, key _ = expected require.Equal(t, *expected, filter) } + +// segmentLengthSizes represents the range of sizes [startSize, endSize] that +// all get the same segmentLength. 
+type segmentLengthSizes struct { + segmentLength uint32 + startSize uint32 + startSegmentCount uint32 + endSize uint32 + endSegmentCount uint32 +} + +var binaryFuseParamStableOnce struct { + once sync.Once + result []segmentLengthSizes +} + +const binaryFuseParamTableMaxSegmentSize = 16384 + +func binaryFuseSegLenAndCnt(size uint32) (segLen uint32, segCnt uint32) { + var f BinaryFuse[uint8] + f.initializeParameters(&BinaryFuseBuilder{}, size) + return f.SegmentLength, f.SegmentCount +} + +func binaryFuseParamsTable() []segmentLengthSizes { + binaryFuseParamStableOnce.once.Do(func() { + var table []segmentLengthSizes + size := uint32(1) + for { + segLen, segCnt := binaryFuseSegLenAndCnt(size) + if segLen > binaryFuseParamTableMaxSegmentSize { + break + } + // Binary-search the offsets [0, size*4) for the first size that changes the segment length. + n := uint32(sort.Search(int(size*4), func(x int) bool { + l, _ := binaryFuseSegLenAndCnt(size + uint32(x)) + return l != segLen + })) + _, endSegCnt := binaryFuseSegLenAndCnt(size + n - 1) + table = append(table, segmentLengthSizes{ + segmentLength: segLen, + startSize: size, + startSegmentCount: segCnt, + endSize: size + n - 1, + endSegmentCount: endSegCnt, + }) + size += n + } + binaryFuseParamStableOnce.result = table + }) + return binaryFuseParamStableOnce.result +} + +// TestBinaryFuseParams shows the segment count and size range for each segment +// length. Used to verify any changes in parameter calculation. 
+func TestBinaryFuseParams(t *testing.T) { + expected := ` +| SegLen | SegCnt range | Size range | +|--------|--------------|-------------------| +| 4 | 1 - 1 | 1 - 2 | +| 8 | 1 - 1 | 3 - 8 | +| 16 | 1 - 2 | 9 - 27 | +| 32 | 1 - 3 | 28 - 91 | +| 64 | 1 - 5 | 92 - 303 | +| 128 | 2 - 9 | 304 - 1009 | +| 256 | 4 - 16 | 1010 - 3361 | +| 512 | 7 - 26 | 3362 - 11192 | +| 1024 | 12 - 42 | 11193 - 37272 | +| 2048 | 20 - 69 | 37273 - 124117 | +| 4096 | 34 - 114 | 124118 - 413309 | +| 8192 | 56 - 188 | 413310 - 1376321 | +| 16384 | 93 - 313 | 1376322 - 4583149 | +` + + var out strings.Builder + fmt.Fprintf(&out, "| SegLen | SegCnt range | Size range |\n") + fmt.Fprintf(&out, "|--------|--------------|-------------------|\n") + for _, row := range binaryFuseParamsTable() { + fmt.Fprintf(&out, "| %6d | %4d - %-5d | %7d - %-7d |\n", + row.segmentLength, + row.startSegmentCount, row.endSegmentCount, + row.startSize, row.endSize, + ) + } + str := out.String() + require.Equal(t, strings.TrimSpace(expected), strings.TrimSpace(str)) +} + +func checkNumIterations(t *testing.T, size uint32) { + const numTrials = 20 + + keys := make([]uint64, size) + var totalIterations, maxIterations int + for range numTrials { + for i := range keys { + keys[i] = rand.Uint64() + } + var b BinaryFuseBuilder + filter, iterations, err := buildBinaryFuse[uint8](&b, keys) + require.NoError(t, err) + for range 100 { + require.True(t, filter.Contains(keys[rand.IntN(len(keys))])) + } + totalIterations += iterations + maxIterations = max(maxIterations, iterations) + } + t.Logf("size: %d iterations: %.2f avg (%d max)", size, float64(totalIterations)/numTrials, maxIterations) +} + +func TestBinaryFuseBoundarySizes(t *testing.T) { + // For each segment length, test the smallest and largest segment count. For a + // given segment count, we want to choose the largest size for that count + // (which has the least "slack" space). 
+ for _, s := range binaryFuseParamsTable() { + if s.startSize > 1_000_000 { + // Larger sizes take too long to test. + break + } + if s.startSegmentCount != s.endSegmentCount { + // Binary-search for the first size that doesn't use the start segment count; the size just before it is the one checked below. + n := uint32(sort.Search(int(s.endSize-s.startSize+1), func(x int) bool { + l, c := binaryFuseSegLenAndCnt(s.startSize + uint32(x)) + return l != s.segmentLength || c != s.startSegmentCount + })) + checkNumIterations(t, s.startSize+n-1) + } + checkNumIterations(t, s.endSize) + } +}