lib/db: Use SipHash to deal with hash collision in GC (#6826)

If the GC finds a key k that it wants to keep, it records that in a
Bloom filter. If a key k' can be removed but its hash collides with k,
it will be kept. Since the old Bloom filter code was completely
deterministic, the next run would encounter the same collision, assuming
k must still be kept.

A randomized hash function that uses all the SHA-256 bits solves this
problem: the second run has a non-zero probability of removing k', as
long as the Bloom filter is not completely full.
This commit is contained in:
greatroar 2020-07-11 09:36:09 +02:00 committed by GitHub
parent bbda58a29f
commit 9f92f8c609
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 40 additions and 16 deletions

1
go.mod
View File

@ -11,6 +11,7 @@ require (
github.com/certifi/gocertifi v0.0.0-20190905060710-a5e0173ced67 // indirect
github.com/chmduquesne/rollinghash v0.0.0-20180912150627-a60f8e7142b5
github.com/d4l3k/messagediff v1.2.1
github.com/dchest/siphash v1.2.1
github.com/dgraph-io/badger/v2 v2.0.3
github.com/flynn-archive/go-shlex v0.0.0-20150515145356-3f9db97f8568
github.com/getsentry/raven-go v0.2.0

2
go.sum
View File

@ -70,6 +70,8 @@ github.com/d4l3k/messagediff v1.2.1/go.mod h1:Oozbb1TVXFac9FtSIxHBMnBCq2qeH/2KkE
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dchest/siphash v1.2.1 h1:4cLinnzVJDKxTCl9B01807Yiy+W7ZzVHj/KIroQRvT4=
github.com/dchest/siphash v1.2.1/go.mod h1:q+IRvb2gOSrUnYoPqHiyHXS0FOBBOdl6tONBlVnOnt4=
github.com/dgraph-io/badger v1.6.1 h1:w9pSFNSdq/JPM1N12Fz/F/bzo993Is1W+Q7HjPzi7yg=
github.com/dgraph-io/badger/v2 v2.0.3 h1:inzdf6VF/NZ+tJ8RwwYMjJMvsOALTHYdozn0qSl6XJI=
github.com/dgraph-io/badger/v2 v2.0.3/go.mod h1:3KY8+bsP8wI0OEnQJAKpd4wIJW/Mm32yw2j/9FUVnIM=

View File

@ -10,11 +10,14 @@ import (
"bytes"
"context"
"encoding/binary"
"io"
"time"
"github.com/dchest/siphash"
"github.com/greatroar/blobloom"
"github.com/syncthing/syncthing/lib/db/backend"
"github.com/syncthing/syncthing/lib/protocol"
"github.com/syncthing/syncthing/lib/rand"
"github.com/syncthing/syncthing/lib/sha256"
"github.com/syncthing/syncthing/lib/sync"
"github.com/syncthing/syncthing/lib/util"
@ -679,10 +682,10 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error {
return err
}
if len(hashes.BlocksHash) > 0 {
blockFilter.Add(bloomHash(hashes.BlocksHash))
blockFilter.add(hashes.BlocksHash)
}
if len(hashes.VersionHash) > 0 {
versionFilter.Add(bloomHash(hashes.VersionHash))
versionFilter.add(hashes.VersionHash)
}
}
it.Release()
@ -707,7 +710,7 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error {
}
key := blockListKey(it.Key())
if blockFilter.Has(bloomHash(key.Hash())) {
if blockFilter.has(key.Hash()) {
matchedBlocks++
continue
}
@ -736,7 +739,7 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error {
}
key := versionKey(it.Key())
if versionFilter.Has(bloomHash(key.Hash())) {
if versionFilter.has(key.Hash()) {
matchedVersions++
continue
}
@ -762,21 +765,39 @@ func (db *Lowlevel) gcIndirect(ctx context.Context) error {
return db.Compact()
}
func newBloomFilter(capacity int) *blobloom.Filter {
return blobloom.NewOptimized(blobloom.Config{
Capacity: uint64(capacity),
FPRate: indirectGCBloomFalsePositiveRate,
MaxBits: 8 * indirectGCBloomMaxBytes,
})
func newBloomFilter(capacity int) bloomFilter {
var buf [16]byte
io.ReadFull(rand.Reader, buf[:])
return bloomFilter{
f: blobloom.NewOptimized(blobloom.Config{
Capacity: uint64(capacity),
FPRate: indirectGCBloomFalsePositiveRate,
MaxBits: 8 * indirectGCBloomMaxBytes,
}),
k0: binary.LittleEndian.Uint64(buf[:8]),
k1: binary.LittleEndian.Uint64(buf[8:]),
}
}
// Hash function for the bloomfilter: first eight bytes of the SHA-256.
// Big or little-endian makes no difference, as long as we're consistent.
func bloomHash(key []byte) uint64 {
if len(key) != sha256.Size {
panic("bug: bloomHash passed something not a SHA256 hash")
type bloomFilter struct {
f *blobloom.Filter
k0, k1 uint64 // Random key for SipHash.
}
func (b *bloomFilter) add(id []byte) { b.f.Add(b.hash(id)) }
func (b *bloomFilter) has(id []byte) bool { return b.f.Has(b.hash(id)) }
// Hash function for the bloomfilter: SipHash of the SHA-256.
//
// The randomization in SipHash means we get different collisions across
// runs and colliding keys are not kept indefinitely.
func (b *bloomFilter) hash(id []byte) uint64 {
if len(id) != sha256.Size {
panic("bug: bloomFilter.hash passed something not a SHA256 hash")
}
return binary.BigEndian.Uint64(key)
return siphash.Hash(b.k0, b.k1, id)
}
// CheckRepair checks folder metadata and sequences for miscellaneous errors.