diff --git a/.github/workflows/bigendian.yml b/.github/workflows/bigendian.yml new file mode 100644 index 0000000..db4433e --- /dev/null +++ b/.github/workflows/bigendian.yml @@ -0,0 +1,26 @@ + +name: Go-PPC64-CI + +on: [push, pull_request] + +jobs: + test: + strategy: + matrix: + go-version: [1.24.x] + platform: [ubuntu-latest] + runs-on: ${{ matrix.platform }} + steps: + - name: Install Go + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go-version }} + - name: Checkout code + uses: actions/checkout@v4 + - name: Install + run: | + sudo apt-get update + sudo apt install -y qemu-system-ppc64 qemu-user + - name: Test + run: | + GOARCH=ppc64 go test ./... \ No newline at end of file diff --git a/README.md b/README.md index 1aa86ce..57e1094 100644 --- a/README.md +++ b/README.md @@ -49,19 +49,16 @@ An xor filter is immutable, it is concurrent. The expectation is that you build Though the filter itself does not use much memory, the construction of the filter needs many bytes of memory per set entry. -For persistence, you only need to serialize the following data structure: +For persistence, you can use `Save` and `LoadBinaryFuse8`. It uses a portable format that works across different systems (little/big endian). ```Go -type BinaryFuse8 struct { - Seed uint64 - SegmentLength uint32 - SegmentLengthMask uint32 - SegmentCount uint32 - SegmentCountLength uint32 - Fingerprints []uint8 -} +errsave := filter.Save(...) +//... +filter, errload := LoadBinaryFuse8(&buf) ``` +Note that it is a direct binary save/restore. There is no data integrity check: loading from corrupted sources might result in runtime errors. We recommend that you use hash codes for integrity checks. + When constructing the filter, you should ensure that there are not too many duplicate keys for best results. 
## Generic (8-bit, 16-bit, 32-bit)
@@ -75,6 +72,9 @@ filter8, _ := xorfilter.NewBinaryFuse[uint8](keys) // 0.39% false positive rate,
 filter16, _ := xorfilter.NewBinaryFuse[uint16](keys) // 0.0015% false positive rate, uses about 18 bits per key
 filter32, _ := xorfilter.NewBinaryFuse[uint32](keys) // 2e-08% false positive rate, uses about 36 bits per key
 ```
+
+You can similarly save or load the data with `Save` and `LoadBinaryFuse[uint16](...)`.
+
 The 32-bit fingerprints are provided but not recommended. Most users will want to use either the 8-bit or 16-bit fingerprints.
 
 The Binary Fuse filters have memory usages of about 9 bits per key in the 8-bit case, 18 bits per key in the 16-bit case,
diff --git a/binaryfusefilter8.go b/binaryfusefilter8.go
index ee8584a..ce22212 100644
--- a/binaryfusefilter8.go
+++ b/binaryfusefilter8.go
@@ -1,5 +1,7 @@
 package xorfilter
 
+import "io"
+
 type BinaryFuse8 BinaryFuse[uint8]
 
 // PopulateBinaryFuse8 fills the filter with provided keys. For best results,
@@ -18,3 +20,18 @@ func PopulateBinaryFuse8(keys []uint64) (*BinaryFuse8, error) {
 func (filter *BinaryFuse8) Contains(key uint64) bool {
 	return (*BinaryFuse[uint8])(filter).Contains(key)
 }
+
+// Save writes the filter to the writer in little endian format.
+// It delegates to the Save method of the underlying BinaryFuse[uint8].
+func (filter *BinaryFuse8) Save(w io.Writer) error {
+	return (*BinaryFuse[uint8])(filter).Save(w)
+}
+
+// LoadBinaryFuse8 reads the filter from the reader in little endian format.
+func LoadBinaryFuse8(r io.Reader) (*BinaryFuse8, error) {
+	filter, err := LoadBinaryFuse[uint8](r)
+	if err != nil {
+		return nil, err
+	}
+	return (*BinaryFuse8)(filter), nil
+}
diff --git a/serialization.go b/serialization.go
new file mode 100644
index 0000000..58bb56d
--- /dev/null
+++ b/serialization.go
@@ -0,0 +1,72 @@
+//go:build (!amd64 && !386 && !arm && !arm64 && !ppc64le && !mipsle && !mips64le && !mips64p32le && !wasm) || appengine
+// +build !amd64,!386,!arm,!arm64,!ppc64le,!mipsle,!mips64le,!mips64p32le,!wasm appengine
+
+package xorfilter
+
+import (
+	"encoding/binary"
+	"io"
+)
+
+// Save writes the filter to the writer in little endian format.
+//
+// The stream starts with a 28-byte header (Seed as 64 bits, then
+// SegmentLength, SegmentLengthMask, SegmentCount, SegmentCountLength and
+// the fingerprint count as 32 bits each), followed by the fingerprints.
+func (f *BinaryFuse[T]) Save(w io.Writer) error {
+	var header [28]byte
+	le := binary.LittleEndian
+	le.PutUint64(header[0:8], f.Seed)
+	le.PutUint32(header[8:12], f.SegmentLength)
+	le.PutUint32(header[12:16], f.SegmentLengthMask)
+	le.PutUint32(header[16:20], f.SegmentCount)
+	le.PutUint32(header[20:24], f.SegmentCountLength)
+	le.PutUint32(header[24:28], uint32(len(f.Fingerprints)))
+	if _, err := w.Write(header[:]); err != nil {
+		return err
+	}
+	// Encode in batches: one writer call per batch, not per fingerprint.
+	for rest := f.Fingerprints; len(rest) > 0; {
+		n := len(rest)
+		if n > 4096 {
+			n = 4096
+		}
+		if err := binary.Write(w, le, rest[:n]); err != nil {
+			return err
+		}
+		rest = rest[n:]
+	}
+	return nil
+}
+
+// LoadBinaryFuse reads the filter from the reader in little endian format.
+func LoadBinaryFuse[T Unsigned](r io.Reader) (*BinaryFuse[T], error) {
+	// Header: Seed (8 bytes), four 32-bit parameters, fingerprint count.
+	var header [28]byte
+	if _, err := io.ReadFull(r, header[:]); err != nil {
+		return nil, err // io.EOF only when the stream is empty
+	}
+	le := binary.LittleEndian
+	f := &BinaryFuse[T]{
+		Seed:               le.Uint64(header[0:8]),
+		SegmentLength:      le.Uint32(header[8:12]),
+		SegmentLengthMask:  le.Uint32(header[12:16]),
+		SegmentCount:       le.Uint32(header[16:20]),
+		SegmentCountLength: le.Uint32(header[20:24]),
+		Fingerprints:       make([]T, le.Uint32(header[24:28])), // count is not validated
+	}
+	// Decode in batches: one reader call per batch, not per fingerprint.
+	for rest := f.Fingerprints; len(rest) > 0; {
+		n := len(rest)
+		if n > 4096 {
+			n = 4096
+		}
+		if err := binary.Read(r, le, rest[:n]); err == io.EOF {
+			return nil, io.ErrUnexpectedEOF // truncated fingerprint table
+		} else if err != nil {
+			return nil, err
+		}
+		rest = rest[n:]
+	}
+	return f, nil
+}
diff --git a/serialization_le.go b/serialization_le.go
new file mode 100644
index 0000000..d852441
--- /dev/null
+++ b/serialization_le.go
@@ -0,0 +1,85 @@
+//go:build (amd64 || 386 || arm || arm64 || ppc64le || mipsle || mips64le || mips64p32le || wasm) && !appengine
+
+package xorfilter
+
+import (
+	"io"
+	"unsafe"
+)
+
+// Save writes the filter to the writer assuming little endian system, using direct byte copy for performance.
+func (f *BinaryFuse[T]) Save(w io.Writer) error {
+	// The fingerprint count travels as a 32-bit value after the parameters.
+	count := uint32(len(f.Fingerprints))
+
+	// Every field is handed to the writer as a view of its own memory, one
+	// Write call per field and in stream order. No byte swapping is done:
+	// this implementation is only meant for little endian systems.
+	fields := [][]byte{
+		(*[8]byte)(unsafe.Pointer(&f.Seed))[:],
+		(*[4]byte)(unsafe.Pointer(&f.SegmentLength))[:],
+		(*[4]byte)(unsafe.Pointer(&f.SegmentLengthMask))[:],
+		(*[4]byte)(unsafe.Pointer(&f.SegmentCount))[:],
+		(*[4]byte)(unsafe.Pointer(&f.SegmentCountLength))[:],
+		(*[4]byte)(unsafe.Pointer(&count))[:],
+	}
+	for _, raw := range fields {
+		if _, err := w.Write(raw); err != nil {
+			return err
+		}
+	}
+
+	// An empty table has no first element whose address could be taken,
+	// so nothing follows the count in that case.
+	if len(f.Fingerprints) == 0 {
+		return nil
+	}
+
+	// View the whole fingerprint table as raw bytes and emit it in a
+	// single Write call.
+	var zero T
+	width := int(unsafe.Sizeof(zero))
+	total := len(f.Fingerprints) * width
+	table := unsafe.Slice((*byte)(unsafe.Pointer(&f.Fingerprints[0])), total)
+	_, err := w.Write(table)
+	return err
+}
+
+// LoadBinaryFuse reads the filter from the reader assuming little endian system, using direct byte copy for performance.
+func LoadBinaryFuse[T Unsigned](r io.Reader) (*BinaryFuse[T], error) {
+	var f BinaryFuse[T]
+	var fpLen uint32
+	// Stream order: Seed, the four segment parameters, the fingerprint count.
+	fields := [][]byte{
+		(*[8]byte)(unsafe.Pointer(&f.Seed))[:],
+		(*[4]byte)(unsafe.Pointer(&f.SegmentLength))[:],
+		(*[4]byte)(unsafe.Pointer(&f.SegmentLengthMask))[:],
+		(*[4]byte)(unsafe.Pointer(&f.SegmentCount))[:],
+		(*[4]byte)(unsafe.Pointer(&f.SegmentCountLength))[:],
+		(*[4]byte)(unsafe.Pointer(&fpLen))[:],
+	}
+	for i, raw := range fields {
+		if _, err := io.ReadFull(r, raw); err != nil {
+			// io.EOF is a clean end only before the first field.
+			if err == io.EOF && i > 0 {
+				err = io.ErrUnexpectedEOF
+			}
+			return nil, err
+		}
+	}
+	// NOTE(review): fpLen comes from the stream and is not validated.
+	f.Fingerprints = make([]T, fpLen)
+	if fpLen == 0 {
+		return &f, nil
+	}
+	var zero T
+	width := int(unsafe.Sizeof(zero))
+	table := unsafe.Slice((*byte)(unsafe.Pointer(&f.Fingerprints[0])), int(fpLen)*width)
+	if _, err := io.ReadFull(r, table); err != nil {
+		if err == io.EOF {
+			err = io.ErrUnexpectedEOF // the fingerprint table is missing
+		}
+		return nil, err
+	}
+	return &f, nil
+}
diff --git a/serialization_test.go b/serialization_test.go
new file mode 100644
index 0000000..b301b26
--- /dev/null
+++ b/serialization_test.go
@@ -0,0 +1,75 @@
+package xorfilter
+
+import (
+	"bytes"
+	"encoding/base64"
+	"reflect"
+	"testing"
+)
+
+// TestBinaryFuse8Serialization saves a BinaryFuse8 filter, loads it back and checks that the copy equals the original and contains every key.
+func TestBinaryFuse8Serialization(t *testing.T) {
+	keys := []uint64{1, 2, 3, 4, 5, 100, 200, 300}
+	filter, err := PopulateBinaryFuse8(keys)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Round trip through the BinaryFuse8 wrappers
+	var buf bytes.Buffer
+	err = filter.Save(&buf)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	loadedFilter, err := LoadBinaryFuse8(&buf)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if !reflect.DeepEqual(filter, loadedFilter) {
+		t.Error("Generic serialization: Filters do not match after save/load")
+	}
+
+	for _, key := range keys {
+		if !loadedFilter.Contains(key) {
+			t.Errorf("Generic serialization: Key %d not found in loaded filter", key)
+		}
+	}
+}
+
+// TestBinaryFuseSerializationGeneric does the same round trip for a 16-bit filter and also compares the serialized bytes with a fixed base64 value.
+func TestBinaryFuseSerializationGeneric(t *testing.T) {
+	keys := []uint64{1, 2, 3, 4, 5, 100, 200, 300}
+	filter, err := NewBinaryFuse[uint16](keys)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Test generic serialization
+	var buf bytes.Buffer
+	err = filter.Save(&buf)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if "wVwCiewtCpEIAAAABwAAAAEAAAAIAAAAGAAAAAAAAABY7/rBAAAAAAoqAAA2kPb5AAAAAAAAAAAAAAAAuLkw2QAAAAAAAH1sAAAAAA==" != base64.StdEncoding.EncodeToString(buf.Bytes()) {
+		t.Log("Base64 serialized data:", base64.StdEncoding.EncodeToString(buf.Bytes()))
+		t.Error("Generic serialization: Unexpected serialized data")
+	}
+
+	loadedFilter, err := LoadBinaryFuse[uint16](&buf)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if !reflect.DeepEqual(filter, loadedFilter) {
+		t.Error("Generic serialization: Filters do not match after save/load")
+	}
+
+	for _, key := range keys {
+		if !loadedFilter.Contains(key) {
+			t.Errorf("Generic serialization: Key %d not found in loaded filter", key)
+		}
+	}
+}