Update goleveldb

This commit is contained in:
Jakob Borg 2014-11-18 16:24:42 +04:00
parent aa637fd942
commit 68399601ce
43 changed files with 2364 additions and 1181 deletions

11
Godeps/Godeps.json generated
View File

@ -25,11 +25,6 @@
"Comment": "null-90",
"Rev": "d65bffbc88a153d23a6d2a864531e6e7c2cde59b"
},
{
"ImportPath": "code.google.com/p/snappy-go/snappy",
"Comment": "null-15",
"Rev": "12e4b4183793ac4b061921e7980845e750679fd0"
},
{
"ImportPath": "github.com/AudriusButkevicius/lfu-go",
"Rev": "164bcecceb92fd6037f4d18a8d97b495ec6ef669"
@ -56,7 +51,11 @@
},
{
"ImportPath": "github.com/syndtr/goleveldb/leveldb",
"Rev": "cd2b8f743192883ab9fbc5f070ebda1dc90f3732"
"Rev": "d8d1d2a5cc2d34c950dffa2f554525415d59f737"
},
{
"ImportPath": "github.com/syndtr/gosnappy/snappy",
"Rev": "ce8acff4829e0c2458a67ead32390ac0a381c862"
},
{
"ImportPath": "github.com/vitrun/qart/coding",

View File

@ -8,65 +8,84 @@ package leveldb
import (
"encoding/binary"
"errors"
"fmt"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/memdb"
)
var (
errBatchTooShort = errors.New("leveldb: batch is too short")
errBatchBadRecord = errors.New("leveldb: bad record in batch")
type ErrBatchCorrupted struct {
Reason string
}
func (e *ErrBatchCorrupted) Error() string {
return fmt.Sprintf("leveldb: batch corrupted: %s", e.Reason)
}
func newErrBatchCorrupted(reason string) error {
return errors.NewErrCorrupted(nil, &ErrBatchCorrupted{reason})
}
const (
batchHdrLen = 8 + 4
batchGrowRec = 3000
)
const kBatchHdrLen = 8 + 4
type batchReplay interface {
put(key, value []byte, seq uint64)
delete(key []byte, seq uint64)
type BatchReplay interface {
Put(key, value []byte)
Delete(key []byte)
}
// Batch is a write batch.
type Batch struct {
buf []byte
data []byte
rLen, bLen int
seq uint64
sync bool
}
func (b *Batch) grow(n int) {
off := len(b.buf)
off := len(b.data)
if off == 0 {
// include headers
off = kBatchHdrLen
n += off
off = batchHdrLen
if b.data != nil {
b.data = b.data[:off]
}
}
if cap(b.buf)-off >= n {
return
if cap(b.data)-off < n {
if b.data == nil {
b.data = make([]byte, off, off+n)
} else {
odata := b.data
div := 1
if b.rLen > batchGrowRec {
div = b.rLen / batchGrowRec
}
b.data = make([]byte, off, off+n+(off-batchHdrLen)/div)
copy(b.data, odata)
}
}
buf := make([]byte, 2*cap(b.buf)+n)
copy(buf, b.buf)
b.buf = buf[:off]
}
func (b *Batch) appendRec(t vType, key, value []byte) {
func (b *Batch) appendRec(kt kType, key, value []byte) {
n := 1 + binary.MaxVarintLen32 + len(key)
if t == tVal {
if kt == ktVal {
n += binary.MaxVarintLen32 + len(value)
}
b.grow(n)
off := len(b.buf)
buf := b.buf[:off+n]
buf[off] = byte(t)
off := len(b.data)
data := b.data[:off+n]
data[off] = byte(kt)
off += 1
off += binary.PutUvarint(buf[off:], uint64(len(key)))
copy(buf[off:], key)
off += binary.PutUvarint(data[off:], uint64(len(key)))
copy(data[off:], key)
off += len(key)
if t == tVal {
off += binary.PutUvarint(buf[off:], uint64(len(value)))
copy(buf[off:], value)
if kt == ktVal {
off += binary.PutUvarint(data[off:], uint64(len(value)))
copy(data[off:], value)
off += len(value)
}
b.buf = buf[:off]
b.data = data[:off]
b.rLen++
// Include 8-byte ikey header
b.bLen += len(key) + len(value) + 8
@ -75,18 +94,51 @@ func (b *Batch) appendRec(t vType, key, value []byte) {
// Put appends 'put operation' of the given key/value pair to the batch.
// It is safe to modify the contents of the argument after Put returns.
func (b *Batch) Put(key, value []byte) {
b.appendRec(tVal, key, value)
b.appendRec(ktVal, key, value)
}
// Delete appends 'delete operation' of the given key to the batch.
// It is safe to modify the contents of the argument after Delete returns.
func (b *Batch) Delete(key []byte) {
b.appendRec(tDel, key, nil)
b.appendRec(ktDel, key, nil)
}
// Dump dumps batch contents. The returned slice can be loaded into the
// batch using Load method.
// The returned slice is not its own copy, so the contents should not be
// modified.
func (b *Batch) Dump() []byte {
return b.encode()
}
// Load loads given slice into the batch. Previous contents of the batch
// will be discarded.
// The given slice will not be copied and will be used as batch buffer, so
// it is not safe to modify the contents of the slice.
func (b *Batch) Load(data []byte) error {
return b.decode(0, data)
}
// Replay replays batch contents.
func (b *Batch) Replay(r BatchReplay) error {
return b.decodeRec(func(i int, kt kType, key, value []byte) {
switch kt {
case ktVal:
r.Put(key, value)
case ktDel:
r.Delete(key)
}
})
}
// Len returns number of records in the batch.
func (b *Batch) Len() int {
return b.rLen
}
// Reset resets the batch.
func (b *Batch) Reset() {
b.buf = nil
b.data = b.data[:0]
b.seq = 0
b.rLen = 0
b.bLen = 0
@ -97,24 +149,10 @@ func (b *Batch) init(sync bool) {
b.sync = sync
}
func (b *Batch) put(key, value []byte, seq uint64) {
if b.rLen == 0 {
b.seq = seq
}
b.Put(key, value)
}
func (b *Batch) delete(key []byte, seq uint64) {
if b.rLen == 0 {
b.seq = seq
}
b.Delete(key)
}
func (b *Batch) append(p *Batch) {
if p.rLen > 0 {
b.grow(len(p.buf) - kBatchHdrLen)
b.buf = append(b.buf, p.buf[kBatchHdrLen:]...)
b.grow(len(p.data) - batchHdrLen)
b.data = append(b.data, p.data[batchHdrLen:]...)
b.rLen += p.rLen
}
if p.sync {
@ -122,95 +160,93 @@ func (b *Batch) append(p *Batch) {
}
}
func (b *Batch) len() int {
return b.rLen
}
// size returns sums of key/value pair length plus 8-bytes ikey.
func (b *Batch) size() int {
return b.bLen
}
func (b *Batch) encode() []byte {
b.grow(0)
binary.LittleEndian.PutUint64(b.buf, b.seq)
binary.LittleEndian.PutUint32(b.buf[8:], uint32(b.rLen))
binary.LittleEndian.PutUint64(b.data, b.seq)
binary.LittleEndian.PutUint32(b.data[8:], uint32(b.rLen))
return b.buf
return b.data
}
func (b *Batch) decode(buf []byte) error {
if len(buf) < kBatchHdrLen {
return errBatchTooShort
func (b *Batch) decode(prevSeq uint64, data []byte) error {
if len(data) < batchHdrLen {
return newErrBatchCorrupted("too short")
}
b.seq = binary.LittleEndian.Uint64(buf)
b.rLen = int(binary.LittleEndian.Uint32(buf[8:]))
b.seq = binary.LittleEndian.Uint64(data)
if b.seq < prevSeq {
return newErrBatchCorrupted("invalid sequence number")
}
b.rLen = int(binary.LittleEndian.Uint32(data[8:]))
if b.rLen < 0 {
return newErrBatchCorrupted("invalid records length")
}
// No need to be precise at this point, it won't be used anyway
b.bLen = len(buf) - kBatchHdrLen
b.buf = buf
b.bLen = len(data) - batchHdrLen
b.data = data
return nil
}
func (b *Batch) decodeRec(f func(i int, t vType, key, value []byte)) error {
off := kBatchHdrLen
func (b *Batch) decodeRec(f func(i int, kt kType, key, value []byte)) (err error) {
off := batchHdrLen
for i := 0; i < b.rLen; i++ {
if off >= len(b.buf) {
return errors.New("leveldb: invalid batch record length")
if off >= len(b.data) {
return newErrBatchCorrupted("invalid records length")
}
t := vType(b.buf[off])
if t > tVal {
return errors.New("leveldb: invalid batch record type in batch")
kt := kType(b.data[off])
if kt > ktVal {
return newErrBatchCorrupted("bad record: invalid type")
}
off += 1
x, n := binary.Uvarint(b.buf[off:])
x, n := binary.Uvarint(b.data[off:])
off += n
if n <= 0 || off+int(x) > len(b.buf) {
return errBatchBadRecord
if n <= 0 || off+int(x) > len(b.data) {
return newErrBatchCorrupted("bad record: invalid key length")
}
key := b.buf[off : off+int(x)]
key := b.data[off : off+int(x)]
off += int(x)
var value []byte
if t == tVal {
x, n := binary.Uvarint(b.buf[off:])
if kt == ktVal {
x, n := binary.Uvarint(b.data[off:])
off += n
if n <= 0 || off+int(x) > len(b.buf) {
return errBatchBadRecord
if n <= 0 || off+int(x) > len(b.data) {
return newErrBatchCorrupted("bad record: invalid value length")
}
value = b.buf[off : off+int(x)]
value = b.data[off : off+int(x)]
off += int(x)
}
f(i, t, key, value)
f(i, kt, key, value)
}
return nil
}
func (b *Batch) replay(to batchReplay) error {
return b.decodeRec(func(i int, t vType, key, value []byte) {
switch t {
case tVal:
to.put(key, value, b.seq+uint64(i))
case tDel:
to.delete(key, b.seq+uint64(i))
}
})
}
func (b *Batch) memReplay(to *memdb.DB) error {
return b.decodeRec(func(i int, t vType, key, value []byte) {
ikey := newIKey(key, b.seq+uint64(i), t)
return b.decodeRec(func(i int, kt kType, key, value []byte) {
ikey := newIkey(key, b.seq+uint64(i), kt)
to.Put(ikey, value)
})
}
func (b *Batch) memDecodeAndReplay(prevSeq uint64, data []byte, to *memdb.DB) error {
if err := b.decode(prevSeq, data); err != nil {
return err
}
return b.memReplay(to)
}
func (b *Batch) revertMemReplay(to *memdb.DB) error {
return b.decodeRec(func(i int, t vType, key, value []byte) {
ikey := newIKey(key, b.seq+uint64(i), t)
return b.decodeRec(func(i int, kt kType, key, value []byte) {
ikey := newIkey(key, b.seq+uint64(i), kt)
to.Delete(ikey)
})
}

View File

@ -15,7 +15,7 @@ import (
)
type tbRec struct {
t vType
kt kType
key, value []byte
}
@ -23,39 +23,39 @@ type testBatch struct {
rec []*tbRec
}
func (p *testBatch) put(key, value []byte, seq uint64) {
p.rec = append(p.rec, &tbRec{tVal, key, value})
func (p *testBatch) Put(key, value []byte) {
p.rec = append(p.rec, &tbRec{ktVal, key, value})
}
func (p *testBatch) delete(key []byte, seq uint64) {
p.rec = append(p.rec, &tbRec{tDel, key, nil})
func (p *testBatch) Delete(key []byte) {
p.rec = append(p.rec, &tbRec{ktDel, key, nil})
}
func compareBatch(t *testing.T, b1, b2 *Batch) {
if b1.seq != b2.seq {
t.Errorf("invalid seq number want %d, got %d", b1.seq, b2.seq)
}
if b1.len() != b2.len() {
t.Fatalf("invalid record length want %d, got %d", b1.len(), b2.len())
if b1.Len() != b2.Len() {
t.Fatalf("invalid record length want %d, got %d", b1.Len(), b2.Len())
}
p1, p2 := new(testBatch), new(testBatch)
err := b1.replay(p1)
err := b1.Replay(p1)
if err != nil {
t.Fatal("error when replaying batch 1: ", err)
}
err = b2.replay(p2)
err = b2.Replay(p2)
if err != nil {
t.Fatal("error when replaying batch 2: ", err)
}
for i := range p1.rec {
r1, r2 := p1.rec[i], p2.rec[i]
if r1.t != r2.t {
t.Errorf("invalid type on record '%d' want %d, got %d", i, r1.t, r2.t)
if r1.kt != r2.kt {
t.Errorf("invalid type on record '%d' want %d, got %d", i, r1.kt, r2.kt)
}
if !bytes.Equal(r1.key, r2.key) {
t.Errorf("invalid key on record '%d' want %s, got %s", i, string(r1.key), string(r2.key))
}
if r1.t == tVal {
if r1.kt == ktVal {
if !bytes.Equal(r1.value, r2.value) {
t.Errorf("invalid value on record '%d' want %s, got %s", i, string(r1.value), string(r2.value))
}
@ -75,7 +75,7 @@ func TestBatch_EncodeDecode(t *testing.T) {
b1.Delete([]byte("k"))
buf := b1.encode()
b2 := new(Batch)
err := b2.decode(buf)
err := b2.decode(0, buf)
if err != nil {
t.Error("error when decoding batch: ", err)
}

View File

@ -249,7 +249,7 @@ func (x *testingCacheObject) Release() {
x.releaseCalled = true
x.cnt.releaseOne()
} else {
x.t.Errorf("duplicate setfin NS#%d KEY#%s", x.ns, x.key)
x.t.Errorf("duplicate setfin NS#%d KEY#%d", x.ns, x.key)
}
}
@ -489,7 +489,7 @@ func TestLRUCache_Finalizer(t *testing.T) {
return true
} else {
if p.delfinCalled != keymax {
t.Errorf("(2) #%d not all delete fin called, diff=%d", p.ns, keymax-p.delfinCalled)
t.Errorf("(2) NS#%d not all delete fin called, diff=%d", p.nsid, keymax-p.delfinCalled)
}
return false
}

View File

@ -1,40 +0,0 @@
// Copyright (c) 2012, Suryandaru Triandana <syndtr@gmail.com>
// All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package leveldb
const (
kNumLevels = 7
// Level-0 compaction is started when we hit this many files.
kL0_CompactionTrigger float64 = 4
// Soft limit on number of level-0 files. We slow down writes at this point.
kL0_SlowdownWritesTrigger = 8
// Maximum number of level-0 files. We stop writes at this point.
kL0_StopWritesTrigger = 12
// Maximum level to which a new compacted memdb is pushed if it
// does not create overlap. We try to push to level 2 to avoid the
// relatively expensive level 0=>1 compactions and to avoid some
// expensive manifest file operations. We do not push all the way to
// the largest level since that can generate a lot of wasted disk
// space if the same key space is being repeatedly overwritten.
kMaxMemCompactLevel = 2
// Maximum size of a table.
kMaxTableSize = 2 * 1048576
// Maximum bytes of overlaps in grandparent (i.e., level+2) before we
// stop building a single file in a level->level+1 compaction.
kMaxGrandParentOverlapBytes = 10 * kMaxTableSize
// Maximum number of bytes in all compacted files. We avoid expanding
// the lower level file set of a compaction if it would make the
// total compaction cover more than this many bytes.
kExpCompactionMaxBytes = 25 * kMaxTableSize
)

View File

@ -8,7 +8,6 @@ package leveldb
import (
"container/list"
"errors"
"fmt"
"io"
"os"
@ -18,6 +17,7 @@ import (
"sync/atomic"
"time"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/journal"
"github.com/syndtr/goleveldb/leveldb/memdb"
@ -57,18 +57,19 @@ type DB struct {
writeMergedC chan bool
writeLockC chan struct{}
writeAckC chan error
writeDelay time.Duration
writeDelayN int
journalC chan *Batch
journalAckC chan error
// Compaction.
tcompCmdC chan cCmd
tcompPauseC chan chan<- struct{}
tcompTriggerC chan struct{}
mcompCmdC chan cCmd
mcompTriggerC chan struct{}
compErrC chan error
compErrSetC chan error
compStats [kNumLevels]cStats
tcompCmdC chan cCmd
tcompPauseC chan chan<- struct{}
mcompCmdC chan cCmd
compErrC chan error
compPerErrC chan error
compErrSetC chan error
compStats []cStats
// Close.
closeW sync.WaitGroup
@ -83,7 +84,7 @@ func openDB(s *session) (*DB, error) {
db := &DB{
s: s,
// Initial sequence
seq: s.stSeq,
seq: s.stSeqNum,
// MemDB
memPool: make(chan *memdb.DB, 1),
// Snapshot
@ -96,13 +97,13 @@ func openDB(s *session) (*DB, error) {
journalC: make(chan *Batch),
journalAckC: make(chan error),
// Compaction
tcompCmdC: make(chan cCmd),
tcompPauseC: make(chan chan<- struct{}),
tcompTriggerC: make(chan struct{}, 1),
mcompCmdC: make(chan cCmd),
mcompTriggerC: make(chan struct{}, 1),
compErrC: make(chan error),
compErrSetC: make(chan error),
tcompCmdC: make(chan cCmd),
tcompPauseC: make(chan chan<- struct{}),
mcompCmdC: make(chan cCmd),
compErrC: make(chan error),
compPerErrC: make(chan error),
compErrSetC: make(chan error),
compStats: make([]cStats, s.o.GetNumLevel()),
// Close
closeC: make(chan struct{}),
}
@ -121,14 +122,14 @@ func openDB(s *session) (*DB, error) {
return nil, err
}
// Don't include compaction error goroutine into wait group.
// Doesn't need to be included in the wait group.
go db.compactionError()
go db.mpoolDrain()
db.closeW.Add(3)
go db.tCompaction()
go db.mCompaction()
go db.jWriter()
go db.mpoolDrain()
s.logf("db@open done T·%v", time.Since(start))
@ -255,6 +256,10 @@ func RecoverFile(path string, o *opt.Options) (db *DB, err error) {
}
func recoverTable(s *session, o *opt.Options) error {
o = dupOptions(o)
// Mask StrictReader, lets StrictRecovery doing its job.
o.Strict &= ^opt.StrictReader
// Get all tables and sort it by file number.
tableFiles_, err := s.getFiles(storage.TypeTable)
if err != nil {
@ -263,10 +268,16 @@ func recoverTable(s *session, o *opt.Options) error {
tableFiles := files(tableFiles_)
tableFiles.sort()
var mSeq uint64
var good, corrupted int
rec := new(sessionRecord)
bpool := util.NewBufferPool(o.GetBlockSize() + 5)
var (
mSeq uint64
recoveredKey, goodKey, corruptedKey, corruptedBlock, droppedTable int
// We will drop corrupted table.
strict = o.GetStrict(opt.StrictRecovery)
rec = &sessionRecord{numLevel: o.GetNumLevel()}
bpool = util.NewBufferPool(o.GetBlockSize() + 5)
)
buildTable := func(iter iterator.Iterator) (tmp storage.File, size int64, err error) {
tmp = s.newTemp()
writer, err := tmp.Create()
@ -321,25 +332,32 @@ func recoverTable(s *session, o *opt.Options) error {
return err
}
var tSeq uint64
var tgood, tcorrupted, blockerr int
var imin, imax []byte
tr := table.NewReader(reader, size, nil, bpool, o)
var (
tSeq uint64
tgoodKey, tcorruptedKey, tcorruptedBlock int
imin, imax []byte
)
tr, err := table.NewReader(reader, size, storage.NewFileInfo(file), nil, bpool, o)
if err != nil {
return err
}
iter := tr.NewIterator(nil, nil)
iter.(iterator.ErrorCallbackSetter).SetErrorCallback(func(err error) {
s.logf("table@recovery found error @%d %q", file.Num(), err)
blockerr++
if errors.IsCorrupted(err) {
s.logf("table@recovery block corruption @%d %q", file.Num(), err)
tcorruptedBlock++
}
})
// Scan the table.
for iter.Next() {
key := iter.Key()
_, seq, _, ok := parseIkey(key)
if !ok {
tcorrupted++
_, seq, _, kerr := parseIkey(key)
if kerr != nil {
tcorruptedKey++
continue
}
tgood++
tgoodKey++
if seq > tSeq {
tSeq = seq
}
@ -354,8 +372,18 @@ func recoverTable(s *session, o *opt.Options) error {
}
iter.Release()
if tgood > 0 {
if tcorrupted > 0 || blockerr > 0 {
goodKey += tgoodKey
corruptedKey += tcorruptedKey
corruptedBlock += tcorruptedBlock
if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) {
droppedTable++
s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", file.Num(), tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
return nil
}
if tgoodKey > 0 {
if tcorruptedKey > 0 || tcorruptedBlock > 0 {
// Rebuild the table.
s.logf("table@recovery rebuilding @%d", file.Num())
iter := tr.NewIterator(nil, nil)
@ -373,16 +401,15 @@ func recoverTable(s *session, o *opt.Options) error {
if tSeq > mSeq {
mSeq = tSeq
}
recoveredKey += tgoodKey
// Add table to level 0.
rec.addTable(0, file.Num(), uint64(size), imin, imax)
s.logf("table@recovery recovered @%d N·%d C·%d B·%d S·%d Q·%d", file.Num(), tgood, tcorrupted, blockerr, size, tSeq)
s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", file.Num(), tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
} else {
s.logf("table@recovery unrecoverable @%d C·%d B·%d S·%d", file.Num(), tcorrupted, blockerr, size)
droppedTable++
s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", file.Num(), tcorruptedKey, tcorruptedBlock, size)
}
good += tgood
corrupted += tcorrupted
return nil
}
@ -399,11 +426,11 @@ func recoverTable(s *session, o *opt.Options) error {
}
}
s.logf("table@recovery recovered F·%d N·%d C·%d Q·%d", len(tableFiles), good, corrupted, mSeq)
s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(tableFiles), recoveredKey, goodKey, corruptedKey, mSeq)
}
// Set sequence number.
rec.setSeq(mSeq + 1)
rec.setSeqNum(mSeq + 1)
// Create new manifest.
if err := s.create(); err != nil {
@ -486,26 +513,30 @@ func (db *DB) recoverJournal() error {
if err == io.EOF {
break
}
return err
return errors.SetFile(err, file)
}
buf.Reset()
if _, err := buf.ReadFrom(r); err != nil {
if err == io.ErrUnexpectedEOF {
// This is error returned due to corruption, with strict == false.
continue
} else {
return err
return errors.SetFile(err, file)
}
}
if err := batch.decode(buf.Bytes()); err != nil {
return err
}
if err := batch.memReplay(mem); err != nil {
return err
if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mem); err != nil {
if strict || !errors.IsCorrupted(err) {
return errors.SetFile(err, file)
} else {
db.s.logf("journal error: %v (skipped)", err)
// We won't apply sequence number as it might be corrupted.
continue
}
}
// Save sequence number.
db.seq = batch.seq + uint64(batch.len())
db.seq = batch.seq + uint64(batch.Len())
// Flush it if large enough.
if mem.Size() >= writeBuffer {
@ -566,7 +597,7 @@ func (db *DB) recoverJournal() error {
}
func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) {
ikey := newIKey(key, seq, tSeek)
ikey := newIkey(key, seq, ktSeek)
em, fm := db.getMems()
for _, m := range [...]*memDB{em, fm} {
@ -577,9 +608,13 @@ func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, er
mk, mv, me := m.mdb.Find(ikey)
if me == nil {
ukey, _, t, ok := parseIkey(mk)
if ok && db.s.icmp.uCompare(ukey, key) == 0 {
if t == tDel {
ukey, _, kt, kerr := parseIkey(mk)
if kerr != nil {
// Shouldn't have had happen.
panic(kerr)
}
if db.s.icmp.uCompare(ukey, key) == 0 {
if kt == ktDel {
return nil, ErrNotFound
}
return append([]byte{}, mv...), nil
@ -594,7 +629,7 @@ func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, er
v.release()
if cSched {
// Trigger table compaction.
db.compTrigger(db.tcompTriggerC)
db.compSendTrigger(db.tcompCmdC)
}
return
}
@ -697,7 +732,7 @@ func (db *DB) GetProperty(name string) (value string, err error) {
var level uint
var rest string
n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest)
if n != 1 || level >= kNumLevels {
if n != 1 || int(level) >= db.s.o.GetNumLevel() {
err = errors.New("leveldb: GetProperty: invalid property: " + name)
} else {
value = fmt.Sprint(v.tLen(int(level)))
@ -759,8 +794,8 @@ func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) {
sizes := make(Sizes, 0, len(ranges))
for _, r := range ranges {
imin := newIKey(r.Start, kMaxSeq, tSeek)
imax := newIKey(r.Limit, kMaxSeq, tSeek)
imin := newIkey(r.Start, kMaxSeq, ktSeek)
imax := newIkey(r.Limit, kMaxSeq, ktSeek)
start, err := v.offsetOf(imin)
if err != nil {
return nil, err
@ -816,6 +851,10 @@ func (db *DB) Close() error {
db.journalWriter.Close()
}
if db.writeDelayN > 0 {
db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay)
}
// Close session.
db.s.close()
db.logf("db@close done T·%v", time.Since(start))

View File

@ -7,11 +7,12 @@
package leveldb
import (
"errors"
"sync"
"time"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/memdb"
"github.com/syndtr/goleveldb/leveldb/opt"
)
var (
@ -68,7 +69,7 @@ type cMem struct {
}
func newCMem(s *session) *cMem {
return &cMem{s: s, rec: new(sessionRecord)}
return &cMem{s: s, rec: &sessionRecord{numLevel: s.o.GetNumLevel()}}
}
func (c *cMem) flush(mem *memdb.DB, level int) error {
@ -84,7 +85,9 @@ func (c *cMem) flush(mem *memdb.DB, level int) error {
// Pick level.
if level < 0 {
level = s.version_NB().pickLevel(t.imin.ukey(), t.imax.ukey())
v := s.version()
level = v.pickLevel(t.imin.ukey(), t.imax.ukey())
v.release()
}
c.rec.addTableFile(level, t)
@ -95,24 +98,32 @@ func (c *cMem) flush(mem *memdb.DB, level int) error {
}
func (c *cMem) reset() {
c.rec = new(sessionRecord)
c.rec = &sessionRecord{numLevel: c.s.o.GetNumLevel()}
}
func (c *cMem) commit(journal, seq uint64) error {
c.rec.setJournalNum(journal)
c.rec.setSeq(seq)
c.rec.setSeqNum(seq)
// Commit changes.
return c.s.commit(c.rec)
}
func (db *DB) compactionError() {
var err error
var (
err error
wlocked bool
)
noerr:
// No error.
for {
select {
case err = <-db.compErrSetC:
if err != nil {
switch {
case err == nil:
case errors.IsCorrupted(err):
goto hasperr
default:
goto haserr
}
case _, _ = <-db.closeC:
@ -120,17 +131,39 @@ noerr:
}
}
haserr:
// Transient error.
for {
select {
case db.compErrC <- err:
case err = <-db.compErrSetC:
if err == nil {
switch {
case err == nil:
goto noerr
case errors.IsCorrupted(err):
goto hasperr
default:
}
case _, _ = <-db.closeC:
return
}
}
hasperr:
// Persistent error.
for {
select {
case db.compErrC <- err:
case db.compPerErrC <- err:
case db.writeLockC <- struct{}{}:
// Hold write lock, so that write won't pass-through.
wlocked = true
case _, _ = <-db.closeC:
if wlocked {
// We should release the lock or Close will hang.
<-db.writeLockC
}
return
}
}
}
type compactionTransactCounter int
@ -139,12 +172,17 @@ func (cnt *compactionTransactCounter) incr() {
*cnt++
}
func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactCounter) error, rollback func() error) {
type compactionTransactInterface interface {
run(cnt *compactionTransactCounter) error
revert() error
}
func (db *DB) compactionTransact(name string, t compactionTransactInterface) {
defer func() {
if x := recover(); x != nil {
if x == errCompactionTransactExiting && rollback != nil {
if err := rollback(); err != nil {
db.logf("%s rollback error %q", name, err)
if x == errCompactionTransactExiting {
if err := t.revert(); err != nil {
db.logf("%s revert error %q", name, err)
}
}
panic(x)
@ -156,9 +194,13 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC
backoffMax = 8 * time.Second
backoffMul = 2 * time.Second
)
backoff := backoffMin
backoffT := time.NewTimer(backoff)
lastCnt := compactionTransactCounter(0)
var (
backoff = backoffMin
backoffT = time.NewTimer(backoff)
lastCnt = compactionTransactCounter(0)
disableBackoff = db.s.o.GetDisableCompactionBackoff()
)
for n := 0; ; n++ {
// Check wether the DB is closed.
if db.isClosed() {
@ -170,11 +212,19 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC
// Execute.
cnt := compactionTransactCounter(0)
err := exec(&cnt)
err := t.run(&cnt)
if err != nil {
db.logf("%s error I·%d %q", name, cnt, err)
}
// Set compaction error status.
select {
case db.compErrSetC <- err:
case perr := <-db.compPerErrC:
if err != nil {
db.logf("%s exiting (persistent error %q)", name, perr)
db.compactionExitTransact()
}
case _, _ = <-db.closeC:
db.logf("%s exiting", name)
db.compactionExitTransact()
@ -182,31 +232,56 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC
if err == nil {
return
}
db.logf("%s error I·%d %q", name, cnt, err)
// Reset backoff duration if counter is advancing.
if cnt > lastCnt {
backoff = backoffMin
lastCnt = cnt
}
// Backoff.
backoffT.Reset(backoff)
if backoff < backoffMax {
backoff *= backoffMul
if backoff > backoffMax {
backoff = backoffMax
}
}
select {
case <-backoffT.C:
case _, _ = <-db.closeC:
db.logf("%s exiting", name)
if errors.IsCorrupted(err) {
db.logf("%s exiting (corruption detected)", name)
db.compactionExitTransact()
}
if !disableBackoff {
// Reset backoff duration if counter is advancing.
if cnt > lastCnt {
backoff = backoffMin
lastCnt = cnt
}
// Backoff.
backoffT.Reset(backoff)
if backoff < backoffMax {
backoff *= backoffMul
if backoff > backoffMax {
backoff = backoffMax
}
}
select {
case <-backoffT.C:
case _, _ = <-db.closeC:
db.logf("%s exiting", name)
db.compactionExitTransact()
}
}
}
}
type compactionTransactFunc struct {
runFunc func(cnt *compactionTransactCounter) error
revertFunc func() error
}
func (t *compactionTransactFunc) run(cnt *compactionTransactCounter) error {
return t.runFunc(cnt)
}
func (t *compactionTransactFunc) revert() error {
if t.revertFunc != nil {
return t.revertFunc()
}
return nil
}
func (db *DB) compactionTransactFunc(name string, run func(cnt *compactionTransactCounter) error, revert func() error) {
db.compactionTransact(name, &compactionTransactFunc{run, revert})
}
func (db *DB) compactionExitTransact() {
panic(errCompactionTransactExiting)
}
@ -232,20 +307,23 @@ func (db *DB) memCompaction() {
}
// Pause table compaction.
ch := make(chan struct{})
resumeC := make(chan struct{})
select {
case db.tcompPauseC <- (chan<- struct{})(ch):
case db.tcompPauseC <- (chan<- struct{})(resumeC):
case <-db.compPerErrC:
close(resumeC)
resumeC = nil
case _, _ = <-db.closeC:
return
}
db.compactionTransact("mem@flush", func(cnt *compactionTransactCounter) (err error) {
db.compactionTransactFunc("mem@flush", func(cnt *compactionTransactCounter) (err error) {
stats.startTimer()
defer stats.stopTimer()
return c.flush(mem.mdb, -1)
}, func() error {
for _, r := range c.rec.addedTables {
db.logf("mem@flush rollback @%d", r.num)
db.logf("mem@flush revert @%d", r.num)
f := db.s.getTableFile(r.num)
if err := f.Remove(); err != nil {
return err
@ -254,7 +332,7 @@ func (db *DB) memCompaction() {
return nil
})
db.compactionTransact("mem@commit", func(cnt *compactionTransactCounter) (err error) {
db.compactionTransactFunc("mem@commit", func(cnt *compactionTransactCounter) (err error) {
stats.startTimer()
defer stats.stopTimer()
return c.commit(db.journalFile.Num(), db.frozenSeq)
@ -271,26 +349,223 @@ func (db *DB) memCompaction() {
db.dropFrozenMem()
// Resume table compaction.
select {
case <-ch:
case _, _ = <-db.closeC:
return
if resumeC != nil {
select {
case <-resumeC:
close(resumeC)
case _, _ = <-db.closeC:
return
}
}
// Trigger table compaction.
db.compTrigger(db.mcompTriggerC)
db.compSendTrigger(db.tcompCmdC)
}
type tableCompactionBuilder struct {
db *DB
s *session
c *compaction
rec *sessionRecord
stat0, stat1 *cStatsStaging
snapHasLastUkey bool
snapLastUkey []byte
snapLastSeq uint64
snapIter int
snapKerrCnt int
snapDropCnt int
kerrCnt int
dropCnt int
minSeq uint64
strict bool
tableSize int
tw *tWriter
}
func (b *tableCompactionBuilder) appendKV(key, value []byte) error {
// Create new table if not already.
if b.tw == nil {
// Check for pause event.
if b.db != nil {
select {
case ch := <-b.db.tcompPauseC:
b.db.pauseCompaction(ch)
case _, _ = <-b.db.closeC:
b.db.compactionExitTransact()
default:
}
}
// Create new table.
var err error
b.tw, err = b.s.tops.create()
if err != nil {
return err
}
}
// Write key/value into table.
return b.tw.append(key, value)
}
func (b *tableCompactionBuilder) needFlush() bool {
return b.tw.tw.BytesLen() >= b.tableSize
}
func (b *tableCompactionBuilder) flush() error {
t, err := b.tw.finish()
if err != nil {
return err
}
b.rec.addTableFile(b.c.level+1, t)
b.stat1.write += t.size
b.s.logf("table@build created L%d@%d N·%d S·%s %q:%q", b.c.level+1, t.file.Num(), b.tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax)
b.tw = nil
return nil
}
func (b *tableCompactionBuilder) cleanup() {
if b.tw != nil {
b.tw.drop()
b.tw = nil
}
}
func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error {
snapResumed := b.snapIter > 0
hasLastUkey := b.snapHasLastUkey // The key might has zero length, so this is necessary.
lastUkey := append([]byte{}, b.snapLastUkey...)
lastSeq := b.snapLastSeq
b.kerrCnt = b.snapKerrCnt
b.dropCnt = b.snapDropCnt
// Restore compaction state.
b.c.restore()
defer b.cleanup()
b.stat1.startTimer()
defer b.stat1.stopTimer()
iter := b.c.newIterator()
defer iter.Release()
for i := 0; iter.Next(); i++ {
// Incr transact counter.
cnt.incr()
// Skip until last state.
if i < b.snapIter {
continue
}
resumed := false
if snapResumed {
resumed = true
snapResumed = false
}
ikey := iter.Key()
ukey, seq, kt, kerr := parseIkey(ikey)
if kerr == nil {
shouldStop := !resumed && b.c.shouldStopBefore(ikey)
if !hasLastUkey || b.s.icmp.uCompare(lastUkey, ukey) != 0 {
// First occurrence of this user key.
// Only rotate tables if ukey doesn't hop across.
if b.tw != nil && (shouldStop || b.needFlush()) {
if err := b.flush(); err != nil {
return err
}
// Creates snapshot of the state.
b.c.save()
b.snapHasLastUkey = hasLastUkey
b.snapLastUkey = append(b.snapLastUkey[:0], lastUkey...)
b.snapLastSeq = lastSeq
b.snapIter = i
b.snapKerrCnt = b.kerrCnt
b.snapDropCnt = b.dropCnt
}
hasLastUkey = true
lastUkey = append(lastUkey[:0], ukey...)
lastSeq = kMaxSeq
}
switch {
case lastSeq <= b.minSeq:
// Dropped because newer entry for same user key exist
fallthrough // (A)
case kt == ktDel && seq <= b.minSeq && b.c.baseLevelForKey(lastUkey):
// For this user key:
// (1) there is no data in higher levels
// (2) data in lower levels will have larger seq numbers
// (3) data in layers that are being compacted here and have
// smaller seq numbers will be dropped in the next
// few iterations of this loop (by rule (A) above).
// Therefore this deletion marker is obsolete and can be dropped.
lastSeq = seq
b.dropCnt++
continue
default:
lastSeq = seq
}
} else {
if b.strict {
return kerr
}
// Don't drop corrupted keys.
hasLastUkey = false
lastUkey = lastUkey[:0]
lastSeq = kMaxSeq
b.kerrCnt++
}
if err := b.appendKV(ikey, iter.Value()); err != nil {
return err
}
}
if err := iter.Error(); err != nil {
return err
}
// Finish last table.
if b.tw != nil && !b.tw.empty() {
return b.flush()
}
return nil
}
func (b *tableCompactionBuilder) revert() error {
for _, at := range b.rec.addedTables {
b.s.logf("table@build revert @%d", at.num)
f := b.s.getTableFile(at.num)
if err := f.Remove(); err != nil {
return err
}
}
return nil
}
func (db *DB) tableCompaction(c *compaction, noTrivial bool) {
rec := new(sessionRecord)
rec.addCompactionPointer(c.level, c.imax)
defer c.release()
rec := &sessionRecord{numLevel: db.s.o.GetNumLevel()}
rec.addCompPtr(c.level, c.imax)
if !noTrivial && c.trivial() {
t := c.tables[0][0]
db.logf("table@move L%d@%d -> L%d", c.level, t.file.Num(), c.level+1)
rec.deleteTable(c.level, t.file.Num())
rec.delTable(c.level, t.file.Num())
rec.addTableFile(c.level+1, t)
db.compactionTransact("table@move", func(cnt *compactionTransactCounter) (err error) {
db.compactionTransactFunc("table@move", func(cnt *compactionTransactCounter) (err error) {
return db.s.commit(rec)
}, nil)
return
@ -301,184 +576,34 @@ func (db *DB) tableCompaction(c *compaction, noTrivial bool) {
for _, t := range tables {
stats[i].read += t.size
// Insert deleted tables into record
rec.deleteTable(c.level+i, t.file.Num())
rec.delTable(c.level+i, t.file.Num())
}
}
sourceSize := int(stats[0].read + stats[1].read)
minSeq := db.minSeq()
db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.level, len(c.tables[0]), c.level+1, len(c.tables[1]), shortenb(sourceSize), minSeq)
var snapUkey []byte
var snapHasUkey bool
var snapSeq uint64
var snapIter int
var snapDropCnt int
var dropCnt int
db.compactionTransact("table@build", func(cnt *compactionTransactCounter) (err error) {
ukey := append([]byte{}, snapUkey...)
hasUkey := snapHasUkey
lseq := snapSeq
dropCnt = snapDropCnt
snapSched := snapIter == 0
var tw *tWriter
finish := func() error {
t, err := tw.finish()
if err != nil {
return err
}
rec.addTableFile(c.level+1, t)
stats[1].write += t.size
db.logf("table@build created L%d@%d N·%d S·%s %q:%q", c.level+1, t.file.Num(), tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax)
return nil
}
defer func() {
stats[1].stopTimer()
if tw != nil {
tw.drop()
tw = nil
}
}()
stats[1].startTimer()
iter := c.newIterator()
defer iter.Release()
for i := 0; iter.Next(); i++ {
// Incr transact counter.
cnt.incr()
// Skip until last state.
if i < snapIter {
continue
}
ikey := iKey(iter.Key())
if c.shouldStopBefore(ikey) && tw != nil {
err = finish()
if err != nil {
return
}
snapSched = true
tw = nil
}
// Scheduled for snapshot, snapshot will used to retry compaction
// if error occured.
if snapSched {
snapUkey = append(snapUkey[:0], ukey...)
snapHasUkey = hasUkey
snapSeq = lseq
snapIter = i
snapDropCnt = dropCnt
snapSched = false
}
if seq, vt, ok := ikey.parseNum(); !ok {
// Don't drop error keys
ukey = ukey[:0]
hasUkey = false
lseq = kMaxSeq
} else {
if !hasUkey || db.s.icmp.uCompare(ikey.ukey(), ukey) != 0 {
// First occurrence of this user key
ukey = append(ukey[:0], ikey.ukey()...)
hasUkey = true
lseq = kMaxSeq
}
drop := false
if lseq <= minSeq {
// Dropped because newer entry for same user key exist
drop = true // (A)
} else if vt == tDel && seq <= minSeq && c.baseLevelForKey(ukey) {
// For this user key:
// (1) there is no data in higher levels
// (2) data in lower levels will have larger seq numbers
// (3) data in layers that are being compacted here and have
// smaller seq numbers will be dropped in the next
// few iterations of this loop (by rule (A) above).
// Therefore this deletion marker is obsolete and can be dropped.
drop = true
}
lseq = seq
if drop {
dropCnt++
continue
}
}
// Create new table if not already
if tw == nil {
// Check for pause event.
select {
case ch := <-db.tcompPauseC:
db.pauseCompaction(ch)
case _, _ = <-db.closeC:
db.compactionExitTransact()
default:
}
// Create new table.
tw, err = db.s.tops.create()
if err != nil {
return
}
}
// Write key/value into table
err = tw.append(ikey, iter.Value())
if err != nil {
return
}
// Finish table if it is big enough
if tw.tw.BytesLen() >= kMaxTableSize {
err = finish()
if err != nil {
return
}
snapSched = true
tw = nil
}
}
err = iter.Error()
if err != nil {
return
}
// Finish last table
if tw != nil && !tw.empty() {
err = finish()
if err != nil {
return
}
tw = nil
}
return
}, func() error {
for _, r := range rec.addedTables {
db.logf("table@build rollback @%d", r.num)
f := db.s.getTableFile(r.num)
if err := f.Remove(); err != nil {
return err
}
}
return nil
})
b := &tableCompactionBuilder{
db: db,
s: db.s,
c: c,
rec: rec,
stat1: &stats[1],
minSeq: minSeq,
strict: db.s.o.GetStrict(opt.StrictCompaction),
tableSize: db.s.o.GetCompactionTableSize(c.level + 1),
}
db.compactionTransact("table@build", b)
// Commit changes
db.compactionTransact("table@commit", func(cnt *compactionTransactCounter) (err error) {
db.compactionTransactFunc("table@commit", func(cnt *compactionTransactCounter) (err error) {
stats[1].startTimer()
defer stats[1].stopTimer()
return db.s.commit(rec)
}, nil)
resultSize := int(stats[1].write)
db.logf("table@compaction committed F%s S%s D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), dropCnt, stats[1].duration)
db.logf("table@compaction committed F%s S%s Ke·%d D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), b.kerrCnt, b.dropCnt, stats[1].duration)
// Save compaction stats
for i := range stats {
@ -494,14 +619,14 @@ func (db *DB) tableRangeCompaction(level int, umin, umax []byte) {
db.tableCompaction(c, true)
}
} else {
v := db.s.version_NB()
v := db.s.version()
m := 1
for i, t := range v.tables[1:] {
if t.overlaps(db.s.icmp, umin, umax, false) {
m = i + 1
}
}
v.release()
for level := 0; level < m; level++ {
if c := db.s.getCompactionRange(level, umin, umax); c != nil {
@ -518,7 +643,9 @@ func (db *DB) tableAutoCompaction() {
}
func (db *DB) tableNeedCompaction() bool {
return db.s.version_NB().needCompaction()
v := db.s.version()
defer v.release()
return v.needCompaction()
}
func (db *DB) pauseCompaction(ch chan<- struct{}) {
@ -538,10 +665,12 @@ type cIdle struct {
}
func (r cIdle) ack(err error) {
defer func() {
recover()
}()
r.ackC <- err
if r.ackC != nil {
defer func() {
recover()
}()
r.ackC <- err
}
}
type cRange struct {
@ -559,6 +688,7 @@ func (r cRange) ack(err error) {
}
}
// This will trigger auto compation and/or wait for all compaction to be done.
func (db *DB) compSendIdle(compC chan<- cCmd) (err error) {
ch := make(chan error)
defer close(ch)
@ -580,6 +710,15 @@ func (db *DB) compSendIdle(compC chan<- cCmd) (err error) {
return err
}
// This will trigger auto compaction but will not wait for it.
func (db *DB) compSendTrigger(compC chan<- cCmd) {
select {
case compC <- cIdle{}:
default:
}
}
// Send range compaction request.
func (db *DB) compSendRange(compC chan<- cCmd, level int, min, max []byte) (err error) {
ch := make(chan error)
defer close(ch)
@ -601,13 +740,6 @@ func (db *DB) compSendRange(compC chan<- cCmd, level int, min, max []byte) (err
return err
}
func (db *DB) compTrigger(compTriggerC chan struct{}) {
select {
case compTriggerC <- struct{}{}:
default:
}
}
func (db *DB) mCompaction() {
var x cCmd
@ -626,11 +758,14 @@ func (db *DB) mCompaction() {
for {
select {
case x = <-db.mcompCmdC:
db.memCompaction()
x.ack(nil)
x = nil
case <-db.mcompTriggerC:
db.memCompaction()
switch x.(type) {
case cIdle:
db.memCompaction()
x.ack(nil)
x = nil
default:
panic("leveldb: unknown command")
}
case _, _ = <-db.closeC:
return
}
@ -661,7 +796,6 @@ func (db *DB) tCompaction() {
if db.tableNeedCompaction() {
select {
case x = <-db.tcompCmdC:
case <-db.tcompTriggerC:
case ch := <-db.tcompPauseC:
db.pauseCompaction(ch)
continue
@ -677,7 +811,6 @@ func (db *DB) tCompaction() {
ackQ = ackQ[:0]
select {
case x = <-db.tcompCmdC:
case <-db.tcompTriggerC:
case ch := <-db.tcompPauseC:
db.pauseCompaction(ch)
continue
@ -692,6 +825,8 @@ func (db *DB) tCompaction() {
case cRange:
db.tableRangeCompaction(cmd.level, cmd.min, cmd.max)
x.ack(nil)
default:
panic("leveldb: unknown command")
}
x = nil
}

View File

@ -48,7 +48,8 @@ func (db *DB) newRawIterator(slice *util.Range, ro *opt.ReadOptions) iterator.It
i = append(i, fmi)
}
i = append(i, ti...)
mi := iterator.NewMergedIterator(i, db.s.icmp, true)
strict := opt.GetStrict(db.s.o.Options, ro, opt.StrictReader)
mi := iterator.NewMergedIterator(i, db.s.icmp, strict)
mi.SetReleaser(&versionReleaser{v: v})
return mi
}
@ -58,10 +59,10 @@ func (db *DB) newIterator(seq uint64, slice *util.Range, ro *opt.ReadOptions) *d
if slice != nil {
islice = &util.Range{}
if slice.Start != nil {
islice.Start = newIKey(slice.Start, kMaxSeq, tSeek)
islice.Start = newIkey(slice.Start, kMaxSeq, ktSeek)
}
if slice.Limit != nil {
islice.Limit = newIKey(slice.Limit, kMaxSeq, tSeek)
islice.Limit = newIkey(slice.Limit, kMaxSeq, ktSeek)
}
}
rawIter := db.newRawIterator(islice, ro)
@ -70,7 +71,7 @@ func (db *DB) newIterator(seq uint64, slice *util.Range, ro *opt.ReadOptions) *d
icmp: db.s.icmp,
iter: rawIter,
seq: seq,
strict: db.s.o.GetStrict(opt.StrictIterator) || ro.GetStrict(opt.StrictIterator),
strict: opt.GetStrict(db.s.o.Options, ro, opt.StrictReader),
key: make([]byte, 0),
value: make([]byte, 0),
}
@ -161,7 +162,7 @@ func (i *dbIter) Seek(key []byte) bool {
return false
}
ikey := newIKey(key, i.seq, tSeek)
ikey := newIkey(key, i.seq, ktSeek)
if i.iter.Seek(ikey) {
i.dir = dirSOI
return i.next()
@ -173,15 +174,14 @@ func (i *dbIter) Seek(key []byte) bool {
func (i *dbIter) next() bool {
for {
ukey, seq, t, ok := parseIkey(i.iter.Key())
if ok {
if ukey, seq, kt, kerr := parseIkey(i.iter.Key()); kerr == nil {
if seq <= i.seq {
switch t {
case tDel:
switch kt {
case ktDel:
// Skip deleted key.
i.key = append(i.key[:0], ukey...)
i.dir = dirForward
case tVal:
case ktVal:
if i.dir == dirSOI || i.icmp.uCompare(ukey, i.key) > 0 {
i.key = append(i.key[:0], ukey...)
i.value = append(i.value[:0], i.iter.Value()...)
@ -191,7 +191,7 @@ func (i *dbIter) next() bool {
}
}
} else if i.strict {
i.setErr(errInvalidIkey)
i.setErr(kerr)
break
}
if !i.iter.Next() {
@ -224,20 +224,19 @@ func (i *dbIter) prev() bool {
del := true
if i.iter.Valid() {
for {
ukey, seq, t, ok := parseIkey(i.iter.Key())
if ok {
if ukey, seq, kt, kerr := parseIkey(i.iter.Key()); kerr == nil {
if seq <= i.seq {
if !del && i.icmp.uCompare(ukey, i.key) < 0 {
return true
}
del = (t == tDel)
del = (kt == ktDel)
if !del {
i.key = append(i.key[:0], ukey...)
i.value = append(i.value[:0], i.iter.Value()...)
}
}
} else if i.strict {
i.setErr(errInvalidIkey)
i.setErr(kerr)
return false
}
if !i.iter.Prev() {
@ -266,13 +265,12 @@ func (i *dbIter) Prev() bool {
return i.Last()
case dirForward:
for i.iter.Prev() {
ukey, _, _, ok := parseIkey(i.iter.Key())
if ok {
if ukey, _, _, kerr := parseIkey(i.iter.Key()); kerr == nil {
if i.icmp.uCompare(ukey, i.key) < 0 {
goto cont
}
} else if i.strict {
i.setErr(errInvalidIkey)
i.setErr(kerr)
return false
}
}

View File

@ -7,6 +7,7 @@
package leveldb
import (
"bytes"
"container/list"
crand "crypto/rand"
"encoding/binary"
@ -23,6 +24,7 @@ import (
"unsafe"
"github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/filter"
"github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/opt"
@ -151,7 +153,10 @@ func (h *dbHarness) maxNextLevelOverlappingBytes(want uint64) {
t := h.t
db := h.db
var res uint64
var (
maxOverlaps uint64
maxLevel int
)
v := db.s.version()
for i, tt := range v.tables[1 : len(v.tables)-1] {
level := i + 1
@ -159,15 +164,18 @@ func (h *dbHarness) maxNextLevelOverlappingBytes(want uint64) {
for _, t := range tt {
r := next.getOverlaps(nil, db.s.icmp, t.imin.ukey(), t.imax.ukey(), false)
sum := r.size()
if sum > res {
res = sum
if sum > maxOverlaps {
maxOverlaps = sum
maxLevel = level
}
}
}
v.release()
if res > want {
t.Errorf("next level overlapping bytes is more than %d, got=%d", want, res)
if maxOverlaps > want {
t.Errorf("next level most overlapping bytes is more than %d, got=%d level=%d", want, maxOverlaps, maxLevel)
} else {
t.Logf("next level most overlapping bytes is %d, level=%d want=%d", maxOverlaps, maxLevel, want)
}
}
@ -240,7 +248,7 @@ func (h *dbHarness) allEntriesFor(key, want string) {
db := h.db
s := db.s
ikey := newIKey([]byte(key), kMaxSeq, tVal)
ikey := newIkey([]byte(key), kMaxSeq, ktVal)
iter := db.newRawIterator(nil, nil)
if !iter.Seek(ikey) && iter.Error() != nil {
t.Error("AllEntries: error during seek, err: ", iter.Error())
@ -249,19 +257,18 @@ func (h *dbHarness) allEntriesFor(key, want string) {
res := "[ "
first := true
for iter.Valid() {
rkey := iKey(iter.Key())
if _, t, ok := rkey.parseNum(); ok {
if s.icmp.uCompare(ikey.ukey(), rkey.ukey()) != 0 {
if ukey, _, kt, kerr := parseIkey(iter.Key()); kerr == nil {
if s.icmp.uCompare(ikey.ukey(), ukey) != 0 {
break
}
if !first {
res += ", "
}
first = false
switch t {
case tVal:
switch kt {
case ktVal:
res += string(iter.Value())
case tDel:
case ktDel:
res += "DEL"
}
} else {
@ -326,6 +333,8 @@ func (h *dbHarness) compactMem() {
t := h.t
db := h.db
t.Log("starting memdb compaction")
db.writeLockC <- struct{}{}
defer func() {
<-db.writeLockC
@ -341,6 +350,8 @@ func (h *dbHarness) compactMem() {
if h.totalTables() == 0 {
t.Error("zero tables after mem compaction")
}
t.Log("memdb compaction done")
}
func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool) {
@ -355,6 +366,8 @@ func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool)
_max = []byte(max)
}
t.Logf("starting table range compaction: level=%d, min=%q, max=%q", level, min, max)
if err := db.compSendRange(db.tcompCmdC, level, _min, _max); err != nil {
if wanterr {
t.Log("CompactRangeAt: got error (expected): ", err)
@ -364,6 +377,8 @@ func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool)
} else if wanterr {
t.Error("CompactRangeAt: expect error")
}
t.Log("table range compaction done")
}
func (h *dbHarness) compactRangeAt(level int, min, max string) {
@ -374,6 +389,8 @@ func (h *dbHarness) compactRange(min, max string) {
t := h.t
db := h.db
t.Logf("starting DB range compaction: min=%q, max=%q", min, max)
var r util.Range
if min != "" {
r.Start = []byte(min)
@ -384,6 +401,8 @@ func (h *dbHarness) compactRange(min, max string) {
if err := db.CompactRange(r); err != nil {
t.Error("CompactRange: got error: ", err)
}
t.Log("DB range compaction done")
}
func (h *dbHarness) sizeAssert(start, limit string, low, hi uint64) {
@ -505,10 +524,10 @@ func Test_FieldsAligned(t *testing.T) {
p1 := new(DB)
testAligned(t, "DB.seq", unsafe.Offsetof(p1.seq))
p2 := new(session)
testAligned(t, "session.stFileNum", unsafe.Offsetof(p2.stFileNum))
testAligned(t, "session.stNextFileNum", unsafe.Offsetof(p2.stNextFileNum))
testAligned(t, "session.stJournalNum", unsafe.Offsetof(p2.stJournalNum))
testAligned(t, "session.stPrevJournalNum", unsafe.Offsetof(p2.stPrevJournalNum))
testAligned(t, "session.stSeq", unsafe.Offsetof(p2.stSeq))
testAligned(t, "session.stSeqNum", unsafe.Offsetof(p2.stSeqNum))
}
func TestDb_Locking(t *testing.T) {
@ -944,7 +963,7 @@ func TestDb_RepeatedWritesToSameKey(t *testing.T) {
h := newDbHarnessWopt(t, &opt.Options{WriteBuffer: 100000})
defer h.close()
maxTables := kNumLevels + kL0_StopWritesTrigger
maxTables := h.o.GetNumLevel() + h.o.GetWriteL0PauseTrigger()
value := strings.Repeat("v", 2*h.o.GetWriteBuffer())
for i := 0; i < 5*maxTables; i++ {
@ -962,7 +981,7 @@ func TestDb_RepeatedWritesToSameKeyAfterReopen(t *testing.T) {
h.reopenDB()
maxTables := kNumLevels + kL0_StopWritesTrigger
maxTables := h.o.GetNumLevel() + h.o.GetWriteL0PauseTrigger()
value := strings.Repeat("v", 2*h.o.GetWriteBuffer())
for i := 0; i < 5*maxTables; i++ {
@ -978,7 +997,7 @@ func TestDb_SparseMerge(t *testing.T) {
h := newDbHarnessWopt(t, &opt.Options{Compression: opt.NoCompression})
defer h.close()
h.putMulti(kNumLevels, "A", "Z")
h.putMulti(h.o.GetNumLevel(), "A", "Z")
// Suppose there is:
// small amount of data with prefix A
@ -1002,6 +1021,7 @@ func TestDb_SparseMerge(t *testing.T) {
h.put("C", "vc2")
h.compactMem()
h.waitCompaction()
h.maxNextLevelOverlappingBytes(20 * 1048576)
h.compactRangeAt(0, "", "")
h.waitCompaction()
@ -1172,7 +1192,7 @@ func TestDb_HiddenValuesAreRemoved(t *testing.T) {
h.put("foo", "v1")
h.compactMem()
m := kMaxMemCompactLevel
m := h.o.GetMaxMemCompationLevel()
v := s.version()
num := v.tLen(m)
v.release()
@ -1216,7 +1236,7 @@ func TestDb_DeletionMarkers2(t *testing.T) {
h.put("foo", "v1")
h.compactMem()
m := kMaxMemCompactLevel
m := h.o.GetMaxMemCompationLevel()
v := s.version()
num := v.tLen(m)
v.release()
@ -1269,14 +1289,14 @@ func TestDb_CompactionTableOpenError(t *testing.T) {
t.Errorf("total tables is %d, want %d", n, im)
}
h.stor.SetOpenErr(storage.TypeTable)
h.stor.SetEmuErr(storage.TypeTable, tsOpOpen)
go h.db.CompactRange(util.Range{})
if err := h.db.compSendIdle(h.db.tcompCmdC); err != nil {
t.Log("compaction error: ", err)
}
h.closeDB0()
h.openDB()
h.stor.SetOpenErr(0)
h.stor.SetEmuErr(0, tsOpOpen)
for i := 0; i < im; i++ {
for j := 0; j < jm; j++ {
@ -1287,7 +1307,7 @@ func TestDb_CompactionTableOpenError(t *testing.T) {
func TestDb_OverlapInLevel0(t *testing.T) {
trun(t, func(h *dbHarness) {
if kMaxMemCompactLevel != 2 {
if h.o.GetMaxMemCompationLevel() != 2 {
t.Fatal("fix test to reflect the config")
}
@ -1407,23 +1427,23 @@ func TestDb_ManifestWriteError(t *testing.T) {
h.compactMem()
h.getVal("foo", "bar")
v := h.db.s.version()
if n := v.tLen(kMaxMemCompactLevel); n != 1 {
if n := v.tLen(h.o.GetMaxMemCompationLevel()); n != 1 {
t.Errorf("invalid total tables, want=1 got=%d", n)
}
v.release()
if i == 0 {
h.stor.SetWriteErr(storage.TypeManifest)
h.stor.SetEmuErr(storage.TypeManifest, tsOpWrite)
} else {
h.stor.SetSyncErr(storage.TypeManifest)
h.stor.SetEmuErr(storage.TypeManifest, tsOpSync)
}
// Merging compaction (will fail)
h.compactRangeAtErr(kMaxMemCompactLevel, "", "", true)
h.compactRangeAtErr(h.o.GetMaxMemCompationLevel(), "", "", true)
h.db.Close()
h.stor.SetWriteErr(0)
h.stor.SetSyncErr(0)
h.stor.SetEmuErr(0, tsOpWrite)
h.stor.SetEmuErr(0, tsOpSync)
// Should not lose data
h.openDB()
@ -1573,7 +1593,7 @@ func TestDb_ManualCompaction(t *testing.T) {
h := newDbHarness(t)
defer h.close()
if kMaxMemCompactLevel != 2 {
if h.o.GetMaxMemCompationLevel() != 2 {
t.Fatal("fix test to reflect the config")
}
@ -1857,7 +1877,7 @@ func TestDb_DeletionMarkersOnMemdb(t *testing.T) {
}
func TestDb_LeveldbIssue178(t *testing.T) {
nKeys := (kMaxTableSize / 30) * 5
nKeys := (opt.DefaultCompactionTableSize / 30) * 5
key1 := func(i int) string {
return fmt.Sprintf("my_key_%d", i)
}
@ -2125,7 +2145,7 @@ func TestDb_GoleveldbIssue72and83(t *testing.T) {
}
}
if err := iter.Error(); err != nil {
t.Fatalf("READER0 #%d.%d W#%d snap.Iterator: %v", i, k, err)
t.Fatalf("READER0 #%d.%d W#%d snap.Iterator: %v", i, k, writei, err)
}
iter.Release()
snap.Release()
@ -2164,5 +2184,385 @@ func TestDb_GoleveldbIssue72and83(t *testing.T) {
}()
wg.Wait()
}
func TestDb_TransientError(t *testing.T) {
h := newDbHarnessWopt(t, &opt.Options{
WriteBuffer: 128 * opt.KiB,
CachedOpenFiles: 3,
DisableCompactionBackoff: true,
})
defer h.close()
const (
nSnap = 20
nKey = 10000
)
var (
snaps [nSnap]*Snapshot
b = &Batch{}
)
for i := range snaps {
vtail := fmt.Sprintf("VAL%030d", i)
b.Reset()
for k := 0; k < nKey; k++ {
key := fmt.Sprintf("KEY%8d", k)
b.Put([]byte(key), []byte(key+vtail))
}
h.stor.SetEmuRandErr(storage.TypeTable, tsOpOpen, tsOpRead, tsOpReadAt)
if err := h.db.Write(b, nil); err != nil {
t.Logf("WRITE #%d error: %v", i, err)
h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt, tsOpWrite)
for {
if err := h.db.Write(b, nil); err == nil {
break
} else if errors.IsCorrupted(err) {
t.Fatalf("WRITE #%d corrupted: %v", i, err)
}
}
}
snaps[i] = h.db.newSnapshot()
b.Reset()
for k := 0; k < nKey; k++ {
key := fmt.Sprintf("KEY%8d", k)
b.Delete([]byte(key))
}
h.stor.SetEmuRandErr(storage.TypeTable, tsOpOpen, tsOpRead, tsOpReadAt)
if err := h.db.Write(b, nil); err != nil {
t.Logf("WRITE #%d error: %v", i, err)
h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt)
for {
if err := h.db.Write(b, nil); err == nil {
break
} else if errors.IsCorrupted(err) {
t.Fatalf("WRITE #%d corrupted: %v", i, err)
}
}
}
}
h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt)
runtime.GOMAXPROCS(runtime.NumCPU())
rnd := rand.New(rand.NewSource(0xecafdaed))
wg := &sync.WaitGroup{}
for i, snap := range snaps {
wg.Add(2)
go func(i int, snap *Snapshot, sk []int) {
defer wg.Done()
vtail := fmt.Sprintf("VAL%030d", i)
for _, k := range sk {
key := fmt.Sprintf("KEY%8d", k)
xvalue, err := snap.Get([]byte(key), nil)
if err != nil {
t.Fatalf("READER_GET #%d SEQ=%d K%d error: %v", i, snap.elem.seq, k, err)
}
value := key + vtail
if !bytes.Equal([]byte(value), xvalue) {
t.Fatalf("READER_GET #%d SEQ=%d K%d invalid value: want %q, got %q", i, snap.elem.seq, k, value, xvalue)
}
}
}(i, snap, rnd.Perm(nKey))
go func(i int, snap *Snapshot) {
defer wg.Done()
vtail := fmt.Sprintf("VAL%030d", i)
iter := snap.NewIterator(nil, nil)
defer iter.Release()
for k := 0; k < nKey; k++ {
if !iter.Next() {
if err := iter.Error(); err != nil {
t.Fatalf("READER_ITER #%d K%d error: %v", i, k, err)
} else {
t.Fatalf("READER_ITER #%d K%d eoi", i, k)
}
}
key := fmt.Sprintf("KEY%8d", k)
xkey := iter.Key()
if !bytes.Equal([]byte(key), xkey) {
t.Fatalf("READER_ITER #%d K%d invalid key: want %q, got %q", i, k, key, xkey)
}
value := key + vtail
xvalue := iter.Value()
if !bytes.Equal([]byte(value), xvalue) {
t.Fatalf("READER_ITER #%d K%d invalid value: want %q, got %q", i, k, value, xvalue)
}
}
}(i, snap)
}
wg.Wait()
}
func TestDb_UkeyShouldntHopAcrossTable(t *testing.T) {
h := newDbHarnessWopt(t, &opt.Options{
WriteBuffer: 112 * opt.KiB,
CompactionTableSize: 90 * opt.KiB,
CompactionExpandLimitFactor: 1,
})
defer h.close()
const (
nSnap = 190
nKey = 140
)
var (
snaps [nSnap]*Snapshot
b = &Batch{}
)
for i := range snaps {
vtail := fmt.Sprintf("VAL%030d", i)
b.Reset()
for k := 0; k < nKey; k++ {
key := fmt.Sprintf("KEY%08d", k)
b.Put([]byte(key), []byte(key+vtail))
}
if err := h.db.Write(b, nil); err != nil {
t.Fatalf("WRITE #%d error: %v", i, err)
}
snaps[i] = h.db.newSnapshot()
b.Reset()
for k := 0; k < nKey; k++ {
key := fmt.Sprintf("KEY%08d", k)
b.Delete([]byte(key))
}
if err := h.db.Write(b, nil); err != nil {
t.Fatalf("WRITE #%d error: %v", i, err)
}
}
h.compactMem()
h.waitCompaction()
for level, tables := range h.db.s.stVersion.tables {
for _, table := range tables {
t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax)
}
}
h.compactRangeAt(0, "", "")
h.waitCompaction()
for level, tables := range h.db.s.stVersion.tables {
for _, table := range tables {
t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax)
}
}
h.compactRangeAt(1, "", "")
h.waitCompaction()
for level, tables := range h.db.s.stVersion.tables {
for _, table := range tables {
t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax)
}
}
runtime.GOMAXPROCS(runtime.NumCPU())
wg := &sync.WaitGroup{}
for i, snap := range snaps {
wg.Add(1)
go func(i int, snap *Snapshot) {
defer wg.Done()
vtail := fmt.Sprintf("VAL%030d", i)
for k := 0; k < nKey; k++ {
key := fmt.Sprintf("KEY%08d", k)
xvalue, err := snap.Get([]byte(key), nil)
if err != nil {
t.Fatalf("READER_GET #%d SEQ=%d K%d error: %v", i, snap.elem.seq, k, err)
}
value := key + vtail
if !bytes.Equal([]byte(value), xvalue) {
t.Fatalf("READER_GET #%d SEQ=%d K%d invalid value: want %q, got %q", i, snap.elem.seq, k, value, xvalue)
}
}
}(i, snap)
}
wg.Wait()
}
func TestDb_TableCompactionBuilder(t *testing.T) {
stor := newTestStorage(t)
defer stor.Close()
const nSeq = 99
o := &opt.Options{
WriteBuffer: 112 * opt.KiB,
CompactionTableSize: 43 * opt.KiB,
CompactionExpandLimitFactor: 1,
CompactionGPOverlapsFactor: 1,
BlockCache: opt.NoCache,
}
s, err := newSession(stor, o)
if err != nil {
t.Fatal(err)
}
if err := s.create(); err != nil {
t.Fatal(err)
}
defer s.close()
var (
seq uint64
targetSize = 5 * o.CompactionTableSize
value = bytes.Repeat([]byte{'0'}, 100)
)
for i := 0; i < 2; i++ {
tw, err := s.tops.create()
if err != nil {
t.Fatal(err)
}
for k := 0; tw.tw.BytesLen() < targetSize; k++ {
key := []byte(fmt.Sprintf("%09d", k))
seq += nSeq - 1
for x := uint64(0); x < nSeq; x++ {
if err := tw.append(newIkey(key, seq-x, ktVal), value); err != nil {
t.Fatal(err)
}
}
}
tf, err := tw.finish()
if err != nil {
t.Fatal(err)
}
rec := &sessionRecord{numLevel: s.o.GetNumLevel()}
rec.addTableFile(i, tf)
if err := s.commit(rec); err != nil {
t.Fatal(err)
}
}
// Build grandparent.
v := s.version()
c := newCompaction(s, v, 1, append(tFiles{}, v.tables[1]...))
rec := &sessionRecord{numLevel: s.o.GetNumLevel()}
b := &tableCompactionBuilder{
s: s,
c: c,
rec: rec,
stat1: new(cStatsStaging),
minSeq: 0,
strict: true,
tableSize: o.CompactionTableSize/3 + 961,
}
if err := b.run(new(compactionTransactCounter)); err != nil {
t.Fatal(err)
}
for _, t := range c.tables[0] {
rec.delTable(c.level, t.file.Num())
}
if err := s.commit(rec); err != nil {
t.Fatal(err)
}
c.release()
// Build level-1.
v = s.version()
c = newCompaction(s, v, 0, append(tFiles{}, v.tables[0]...))
rec = &sessionRecord{numLevel: s.o.GetNumLevel()}
b = &tableCompactionBuilder{
s: s,
c: c,
rec: rec,
stat1: new(cStatsStaging),
minSeq: 0,
strict: true,
tableSize: o.CompactionTableSize,
}
if err := b.run(new(compactionTransactCounter)); err != nil {
t.Fatal(err)
}
for _, t := range c.tables[0] {
rec.delTable(c.level, t.file.Num())
}
// Move grandparent to level-3
for _, t := range v.tables[2] {
rec.delTable(2, t.file.Num())
rec.addTableFile(3, t)
}
if err := s.commit(rec); err != nil {
t.Fatal(err)
}
c.release()
v = s.version()
for level, want := range []bool{false, true, false, true, false} {
got := len(v.tables[level]) > 0
if want != got {
t.Fatalf("invalid level-%d tables len: want %v, got %v", level, want, got)
}
}
for i, f := range v.tables[1][:len(v.tables[1])-1] {
nf := v.tables[1][i+1]
if bytes.Equal(f.imax.ukey(), nf.imin.ukey()) {
t.Fatalf("KEY %q hop across table %d .. %d", f.imax.ukey(), f.file.Num(), nf.file.Num())
}
}
v.release()
// Compaction with transient error.
v = s.version()
c = newCompaction(s, v, 1, append(tFiles{}, v.tables[1]...))
rec = &sessionRecord{numLevel: s.o.GetNumLevel()}
b = &tableCompactionBuilder{
s: s,
c: c,
rec: rec,
stat1: new(cStatsStaging),
minSeq: 0,
strict: true,
tableSize: o.CompactionTableSize,
}
stor.SetEmuErrOnce(storage.TypeTable, tsOpSync)
stor.SetEmuRandErr(storage.TypeTable, tsOpRead, tsOpReadAt, tsOpWrite)
stor.SetEmuRandErrProb(0xf0)
for {
if err := b.run(new(compactionTransactCounter)); err != nil {
t.Logf("(expected) b.run: %v", err)
} else {
break
}
}
if err := s.commit(rec); err != nil {
t.Fatal(err)
}
c.release()
stor.SetEmuErrOnce(0, tsOpSync)
stor.SetEmuRandErr(0, tsOpRead, tsOpReadAt, tsOpWrite)
v = s.version()
if len(v.tables[1]) != len(v.tables[2]) {
t.Fatalf("invalid tables length, want %d, got %d", len(v.tables[1]), len(v.tables[2]))
}
for i, f0 := range v.tables[1] {
f1 := v.tables[2][i]
iter0 := s.tops.newIterator(f0, nil, nil)
iter1 := s.tops.newIterator(f1, nil, nil)
for j := 0; true; j++ {
next0 := iter0.Next()
next1 := iter1.Next()
if next0 != next1 {
t.Fatalf("#%d.%d invalid eoi: want %v, got %v", i, j, next0, next1)
}
key0 := iter0.Key()
key1 := iter1.Key()
if !bytes.Equal(key0, key1) {
t.Fatalf("#%d.%d invalid key: want %q, got %q", i, j, key0, key1)
}
if next0 == false {
break
}
}
iter0.Release()
iter1.Release()
}
v.release()
}

View File

@ -7,8 +7,7 @@
package leveldb
import (
"errors"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/opt"
"github.com/syndtr/goleveldb/leveldb/storage"
@ -38,7 +37,9 @@ func (db *DB) logf(format string, v ...interface{}) { db.s.logf(format, v...) }
// Check and clean files.
func (db *DB) checkAndCleanFiles() error {
v := db.s.version_NB()
v := db.s.version()
defer v.release()
tablesMap := make(map[uint64]bool)
for _, tables := range v.tables {
for _, t := range tables {
@ -78,12 +79,14 @@ func (db *DB) checkAndCleanFiles() error {
}
if nTables != len(tablesMap) {
var missing []*storage.FileInfo
for num, present := range tablesMap {
if !present {
missing = append(missing, &storage.FileInfo{Type: storage.TypeTable, Num: num})
db.logf("db@janitor table missing @%d", num)
}
}
return ErrCorrupted{Type: MissingFiles, Err: errors.New("leveldb: table files missing")}
return errors.NewErrCorrupted(nil, &errors.ErrMissingFiles{Files: missing})
}
db.logf("db@janitor F·%d G·%d", len(files), len(rem))

View File

@ -59,7 +59,7 @@ func (db *DB) rotateMem(n int) (mem *memDB, err error) {
}
// Schedule memdb compaction.
db.compTrigger(db.mcompTriggerC)
db.compSendTrigger(db.mcompCmdC)
return
}
@ -77,12 +77,12 @@ func (db *DB) flush(n int) (mem *memDB, nn int, err error) {
}()
nn = mem.mdb.Free()
switch {
case v.tLen(0) >= kL0_SlowdownWritesTrigger && !delayed:
case v.tLen(0) >= db.s.o.GetWriteL0SlowdownTrigger() && !delayed:
delayed = true
time.Sleep(time.Millisecond)
case nn >= n:
return false
case v.tLen(0) >= kL0_StopWritesTrigger:
case v.tLen(0) >= db.s.o.GetWriteL0PauseTrigger():
delayed = true
err = db.compSendIdle(db.tcompCmdC)
if err != nil {
@ -109,7 +109,12 @@ func (db *DB) flush(n int) (mem *memDB, nn int, err error) {
for flush() {
}
if delayed {
db.logf("db@write delayed T·%v", time.Since(start))
db.writeDelay += time.Since(start)
db.writeDelayN++
} else if db.writeDelayN > 0 {
db.writeDelay = 0
db.writeDelayN = 0
db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay)
}
return
}
@ -120,7 +125,7 @@ func (db *DB) flush(n int) (mem *memDB, nn int, err error) {
// It is safe to modify the contents of the arguments after Write returns.
func (db *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) {
err = db.ok()
if err != nil || b == nil || b.len() == 0 {
if err != nil || b == nil || b.Len() == 0 {
return
}
@ -133,6 +138,8 @@ func (db *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) {
return <-db.writeAckC
}
case db.writeLockC <- struct{}{}:
case err = <-db.compPerErrC:
return
case _, _ = <-db.closeC:
return ErrClosed
}
@ -188,35 +195,43 @@ drain:
if b.size() >= (128 << 10) {
// Push the write batch to the journal writer
select {
case db.journalC <- b:
// Write into memdb
if berr := b.memReplay(mem.mdb); berr != nil {
panic(berr)
}
case err = <-db.compPerErrC:
return
case _, _ = <-db.closeC:
err = ErrClosed
return
case db.journalC <- b:
// Write into memdb
b.memReplay(mem.mdb)
}
// Wait for journal writer
select {
case _, _ = <-db.closeC:
err = ErrClosed
return
case err = <-db.journalAckC:
if err != nil {
// Revert memdb if error detected
b.revertMemReplay(mem.mdb)
if berr := b.revertMemReplay(mem.mdb); berr != nil {
panic(berr)
}
return
}
case _, _ = <-db.closeC:
err = ErrClosed
return
}
} else {
err = db.writeJournal(b)
if err != nil {
return
}
b.memReplay(mem.mdb)
if berr := b.memReplay(mem.mdb); berr != nil {
panic(berr)
}
}
// Set last seq number.
db.addSeq(uint64(b.len()))
db.addSeq(uint64(b.Len()))
if b.size() >= memFree {
db.rotateMem(0)
@ -268,6 +283,8 @@ func (db *DB) CompactRange(r util.Range) error {
// Lock writer.
select {
case db.writeLockC <- struct{}{}:
case err := <-db.compPerErrC:
return err
case _, _ = <-db.closeC:
return ErrClosed
}

View File

@ -7,32 +7,12 @@
package leveldb
import (
"errors"
"github.com/syndtr/goleveldb/leveldb/util"
"github.com/syndtr/goleveldb/leveldb/errors"
)
var (
ErrNotFound = util.ErrNotFound
ErrNotFound = errors.ErrNotFound
ErrSnapshotReleased = errors.New("leveldb: snapshot released")
ErrIterReleased = errors.New("leveldb: iterator released")
ErrClosed = errors.New("leveldb: closed")
)
type CorruptionType int
const (
CorruptedManifest CorruptionType = iota
MissingFiles
)
// ErrCorrupted is the type that wraps errors that indicate corruption in
// the database.
type ErrCorrupted struct {
Type CorruptionType
Err error
}
func (e ErrCorrupted) Error() string {
return e.Err.Error()
}

View File

@ -0,0 +1,76 @@
// Copyright (c) 2014, Suryandaru Triandana <syndtr@gmail.com>
// All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Package errors provides common error types used throughout leveldb.
package errors
import (
"errors"
"fmt"
"github.com/syndtr/goleveldb/leveldb/storage"
"github.com/syndtr/goleveldb/leveldb/util"
)
var (
ErrNotFound = New("leveldb: not found")
ErrReleased = util.ErrReleased
ErrHasReleaser = util.ErrHasReleaser
)
// New returns an error that formats as the given text.
func New(text string) error {
return errors.New(text)
}
// ErrCorrupted is the type that wraps errors that indicate corruption in
// the database.
type ErrCorrupted struct {
File *storage.FileInfo
Err error
}
func (e *ErrCorrupted) Error() string {
if e.File != nil {
return fmt.Sprintf("%v [file=%v]", e.Err, e.File)
} else {
return e.Err.Error()
}
}
// NewErrCorrupted creates new ErrCorrupted error.
func NewErrCorrupted(f storage.File, err error) error {
return &ErrCorrupted{storage.NewFileInfo(f), err}
}
// IsCorrupted returns a boolean indicating whether the error is indicating
// a corruption.
func IsCorrupted(err error) bool {
switch err.(type) {
case *ErrCorrupted:
return true
}
return false
}
// ErrMissingFiles is the type that indicating a corruption due to missing
// files.
type ErrMissingFiles struct {
Files []*storage.FileInfo
}
func (e *ErrMissingFiles) Error() string { return "file missing" }
// SetFile sets 'file info' of the given error with the given file.
// Currently only ErrCorrupted is supported, otherwise will do nothing.
func SetFile(err error, f storage.File) error {
switch x := err.(type) {
case *ErrCorrupted:
x.File = storage.NewFileInfo(f)
return x
}
return err
}

View File

@ -7,6 +7,7 @@
package iterator
import (
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/util"
)
@ -22,9 +23,8 @@ type IteratorIndexer interface {
type indexedIterator struct {
util.BasicReleaser
index IteratorIndexer
strict bool
strictGet bool
index IteratorIndexer
strict bool
data Iterator
err error
@ -37,11 +37,6 @@ func (i *indexedIterator) setData() {
i.data.Release()
}
i.data = i.index.Get()
if i.strictGet {
if err := i.data.Error(); err != nil {
i.err = err
}
}
}
func (i *indexedIterator) clearData() {
@ -61,13 +56,11 @@ func (i *indexedIterator) indexErr() {
}
func (i *indexedIterator) dataErr() bool {
if i.errf != nil {
if err := i.data.Error(); err != nil {
if err := i.data.Error(); err != nil {
if i.errf != nil {
i.errf(err)
}
}
if i.strict {
if err := i.data.Error(); err != nil {
if i.strict || !errors.IsCorrupted(err) {
i.err = err
return true
}
@ -236,16 +229,14 @@ func (i *indexedIterator) SetErrorCallback(f func(err error)) {
i.errf = f
}
// NewIndexedIterator returns an indexed iterator. An index is iterator
// that returns another iterator, a data iterator. A data iterator is the
// NewIndexedIterator returns an 'indexed iterator'. An index is iterator
// that returns another iterator, a 'data iterator'. A 'data iterator' is the
// iterator that contains actual key/value pairs.
//
// If strict is true then error yield by data iterator will halt the indexed
// iterator, on contrary if strict is false then the indexed iterator will
// ignore those error and move on to the next index. If strictGet is true and
// index.Get() yield an 'error iterator' then the indexed iterator will be halted.
// An 'error iterator' is iterator which its Error() method always return non-nil
// even before any 'seeks method' is called.
func NewIndexedIterator(index IteratorIndexer, strict, strictGet bool) Iterator {
return &indexedIterator{index: index, strict: strict, strictGet: strictGet}
// If strict is true the any 'corruption errors' (i.e errors.IsCorrupted(err) == true)
// won't be ignored and will halt 'indexed iterator', otherwise the iterator will
// continue to the next 'data iterator'. Corruption on 'index iterator' will not be
// ignored and will halt the iterator.
func NewIndexedIterator(index IteratorIndexer, strict bool) Iterator {
return &indexedIterator{index: index, strict: strict}
}

View File

@ -65,7 +65,7 @@ var _ = testutil.Defer(func() {
// Test the iterator.
t := testutil.IteratorTesting{
KeyValue: kv.Clone(),
Iter: NewIndexedIterator(NewArrayIndexer(index), true, true),
Iter: NewIndexedIterator(NewArrayIndexer(index), true),
}
testutil.DoIteratorTesting(&t)
done <- true

View File

@ -8,6 +8,7 @@ package iterator
import (
"github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/util"
)
@ -42,13 +43,11 @@ func assertKey(key []byte) []byte {
}
func (i *mergedIterator) iterErr(iter Iterator) bool {
if i.errf != nil {
if err := iter.Error(); err != nil {
if err := iter.Error(); err != nil {
if i.errf != nil {
i.errf(err)
}
}
if i.strict {
if err := iter.Error(); err != nil {
if i.strict || !errors.IsCorrupted(err) {
i.err = err
return true
}
@ -292,9 +291,9 @@ func (i *mergedIterator) SetErrorCallback(f func(err error)) {
// keys: if iters[i] contains a key k then iters[j] will not contain that key k.
// None of the iters may be nil.
//
// If strict is true then error yield by any iterators will halt the merged
// iterator, on contrary if strict is false then the merged iterator will
// ignore those error and move on to the next iterator.
// If strict is true the any 'corruption errors' (i.e errors.IsCorrupted(err) == true)
// won't be ignored and will halt 'merged iterator', otherwise the iterator will
// continue to the next 'input iterator'.
func NewMergedIterator(iters []Iterator, cmp comparer.Comparer, strict bool) Iterator {
return &mergedIterator{
iters: iters,

View File

@ -79,10 +79,10 @@ package journal
import (
"encoding/binary"
"errors"
"fmt"
"io"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/util"
)
@ -109,7 +109,7 @@ type ErrCorrupted struct {
Reason string
}
func (e ErrCorrupted) Error() string {
func (e *ErrCorrupted) Error() string {
return fmt.Sprintf("leveldb/journal: block/chunk corrupted: %s (%d bytes)", e.Reason, e.Size)
}
@ -162,10 +162,10 @@ var errSkip = errors.New("leveldb/journal: skipped")
func (r *Reader) corrupt(n int, reason string, skip bool) error {
if r.dropper != nil {
r.dropper.Drop(ErrCorrupted{n, reason})
r.dropper.Drop(&ErrCorrupted{n, reason})
}
if r.strict && !skip {
r.err = ErrCorrupted{n, reason}
r.err = errors.NewErrCorrupted(nil, &ErrCorrupted{n, reason})
return r.err
}
return errSkip

View File

@ -9,15 +9,30 @@ package leveldb
import (
"encoding/binary"
"fmt"
"github.com/syndtr/goleveldb/leveldb/errors"
)
type vType int
type ErrIkeyCorrupted struct {
Ikey []byte
Reason string
}
func (t vType) String() string {
switch t {
case tDel:
func (e *ErrIkeyCorrupted) Error() string {
return fmt.Sprintf("leveldb: iKey %q corrupted: %s", e.Ikey, e.Reason)
}
func newErrIkeyCorrupted(ikey []byte, reason string) error {
return errors.NewErrCorrupted(nil, &ErrIkeyCorrupted{append([]byte{}, ikey...), reason})
}
type kType int
func (kt kType) String() string {
switch kt {
case ktDel:
return "d"
case tVal:
case ktVal:
return "v"
}
return "x"
@ -26,16 +41,16 @@ func (t vType) String() string {
// Value types encoded as the last component of internal keys.
// Don't modify; this value are saved to disk.
const (
tDel vType = iota
tVal
ktDel kType = iota
ktVal
)
// tSeek defines the vType that should be passed when constructing an
// ktSeek defines the kType that should be passed when constructing an
// internal key for seeking to a particular sequence number (since we
// sort sequence numbers in decreasing order and the value type is
// embedded as the low 8 bits in the sequence number in internal keys,
// we need to use the highest-numbered ValueType, not the lowest).
const tSeek = tVal
const ktSeek = ktVal
const (
// Maximum value possible for sequence number; the 8-bits are
@ -43,7 +58,7 @@ const (
// 64-bit integer.
kMaxSeq uint64 = (uint64(1) << 56) - 1
// Maximum value possible for packed sequence number and type.
kMaxNum uint64 = (kMaxSeq << 8) | uint64(tSeek)
kMaxNum uint64 = (kMaxSeq << 8) | uint64(ktSeek)
)
// Maximum number encoded in bytes.
@ -55,85 +70,73 @@ func init() {
type iKey []byte
func newIKey(ukey []byte, seq uint64, t vType) iKey {
if seq > kMaxSeq || t > tVal {
panic("invalid seq number or value type")
func newIkey(ukey []byte, seq uint64, kt kType) iKey {
if seq > kMaxSeq {
panic("leveldb: invalid sequence number")
} else if kt > ktVal {
panic("leveldb: invalid type")
}
b := make(iKey, len(ukey)+8)
copy(b, ukey)
binary.LittleEndian.PutUint64(b[len(ukey):], (seq<<8)|uint64(t))
return b
ik := make(iKey, len(ukey)+8)
copy(ik, ukey)
binary.LittleEndian.PutUint64(ik[len(ukey):], (seq<<8)|uint64(kt))
return ik
}
func parseIkey(p []byte) (ukey []byte, seq uint64, t vType, ok bool) {
if len(p) < 8 {
return
func parseIkey(ik []byte) (ukey []byte, seq uint64, kt kType, err error) {
if len(ik) < 8 {
return nil, 0, 0, newErrIkeyCorrupted(ik, "invalid length")
}
num := binary.LittleEndian.Uint64(p[len(p)-8:])
seq, t = uint64(num>>8), vType(num&0xff)
if t > tVal {
return
num := binary.LittleEndian.Uint64(ik[len(ik)-8:])
seq, kt = uint64(num>>8), kType(num&0xff)
if kt > ktVal {
return nil, 0, 0, newErrIkeyCorrupted(ik, "invalid type")
}
ukey = p[:len(p)-8]
ok = true
ukey = ik[:len(ik)-8]
return
}
func validIkey(p []byte) bool {
_, _, _, ok := parseIkey(p)
return ok
func validIkey(ik []byte) bool {
_, _, _, err := parseIkey(ik)
return err == nil
}
func (p iKey) assert() {
if p == nil {
panic("nil iKey")
func (ik iKey) assert() {
if ik == nil {
panic("leveldb: nil iKey")
}
if len(p) < 8 {
panic(fmt.Sprintf("invalid iKey %q, len=%d", []byte(p), len(p)))
if len(ik) < 8 {
panic(fmt.Sprintf("leveldb: iKey %q, len=%d: invalid length", ik, len(ik)))
}
}
func (p iKey) ok() bool {
if len(p) < 8 {
return false
}
_, _, ok := p.parseNum()
return ok
func (ik iKey) ukey() []byte {
ik.assert()
return ik[:len(ik)-8]
}
func (p iKey) ukey() []byte {
p.assert()
return p[:len(p)-8]
func (ik iKey) num() uint64 {
ik.assert()
return binary.LittleEndian.Uint64(ik[len(ik)-8:])
}
func (p iKey) num() uint64 {
p.assert()
return binary.LittleEndian.Uint64(p[len(p)-8:])
}
func (p iKey) parseNum() (seq uint64, t vType, ok bool) {
if p == nil {
panic("nil iKey")
func (ik iKey) parseNum() (seq uint64, kt kType) {
num := ik.num()
seq, kt = uint64(num>>8), kType(num&0xff)
if kt > ktVal {
panic(fmt.Sprintf("leveldb: iKey %q, len=%d: invalid type %#x", ik, len(ik), kt))
}
if len(p) < 8 {
return
}
num := p.num()
seq, t = uint64(num>>8), vType(num&0xff)
if t > tVal {
return 0, 0, false
}
ok = true
return
}
func (p iKey) String() string {
if len(p) == 0 {
func (ik iKey) String() string {
if ik == nil {
return "<nil>"
}
if seq, t, ok := p.parseNum(); ok {
return fmt.Sprintf("%s,%s%d", shorten(string(p.ukey())), t, seq)
if ukey, seq, kt, err := parseIkey(ik); err == nil {
return fmt.Sprintf("%s,%s%d", shorten(string(ukey)), kt, seq)
} else {
return "<invalid>"
}
return "<invalid>"
}

View File

@ -15,8 +15,8 @@ import (
var defaultIComparer = &iComparer{comparer.DefaultComparer}
func ikey(key string, seq uint64, t vType) iKey {
return newIKey([]byte(key), uint64(seq), t)
func ikey(key string, seq uint64, kt kType) iKey {
return newIkey([]byte(key), uint64(seq), kt)
}
func shortSep(a, b []byte) []byte {
@ -37,27 +37,37 @@ func shortSuccessor(b []byte) []byte {
return dst
}
func testSingleKey(t *testing.T, key string, seq uint64, vt vType) {
ik := ikey(key, seq, vt)
func testSingleKey(t *testing.T, key string, seq uint64, kt kType) {
ik := ikey(key, seq, kt)
if !bytes.Equal(ik.ukey(), []byte(key)) {
t.Errorf("user key does not equal, got %v, want %v", string(ik.ukey()), key)
}
if rseq, rt, ok := ik.parseNum(); ok {
rseq, rt := ik.parseNum()
if rseq != seq {
t.Errorf("seq number does not equal, got %v, want %v", rseq, seq)
}
if rt != kt {
t.Errorf("type does not equal, got %v, want %v", rt, kt)
}
if rukey, rseq, rt, kerr := parseIkey(ik); kerr == nil {
if !bytes.Equal(rukey, []byte(key)) {
t.Errorf("user key does not equal, got %v, want %v", string(ik.ukey()), key)
}
if rseq != seq {
t.Errorf("seq number does not equal, got %v, want %v", rseq, seq)
}
if rt != vt {
t.Errorf("type does not equal, got %v, want %v", rt, vt)
if rt != kt {
t.Errorf("type does not equal, got %v, want %v", rt, kt)
}
} else {
t.Error("cannot parse seq and type")
t.Errorf("key error: %v", kerr)
}
}
func TestIKey_EncodeDecode(t *testing.T) {
func TestIkey_EncodeDecode(t *testing.T) {
keys := []string{"", "k", "hello", "longggggggggggggggggggggg"}
seqs := []uint64{
1, 2, 3,
@ -67,8 +77,8 @@ func TestIKey_EncodeDecode(t *testing.T) {
}
for _, key := range keys {
for _, seq := range seqs {
testSingleKey(t, key, seq, tVal)
testSingleKey(t, "hello", 1, tDel)
testSingleKey(t, key, seq, ktVal)
testSingleKey(t, "hello", 1, ktDel)
}
}
}
@ -79,45 +89,45 @@ func assertBytes(t *testing.T, want, got []byte) {
}
}
func TestIKeyShortSeparator(t *testing.T) {
func TestIkeyShortSeparator(t *testing.T) {
// When user keys are same
assertBytes(t, ikey("foo", 100, tVal),
shortSep(ikey("foo", 100, tVal),
ikey("foo", 99, tVal)))
assertBytes(t, ikey("foo", 100, tVal),
shortSep(ikey("foo", 100, tVal),
ikey("foo", 101, tVal)))
assertBytes(t, ikey("foo", 100, tVal),
shortSep(ikey("foo", 100, tVal),
ikey("foo", 100, tVal)))
assertBytes(t, ikey("foo", 100, tVal),
shortSep(ikey("foo", 100, tVal),
ikey("foo", 100, tDel)))
assertBytes(t, ikey("foo", 100, ktVal),
shortSep(ikey("foo", 100, ktVal),
ikey("foo", 99, ktVal)))
assertBytes(t, ikey("foo", 100, ktVal),
shortSep(ikey("foo", 100, ktVal),
ikey("foo", 101, ktVal)))
assertBytes(t, ikey("foo", 100, ktVal),
shortSep(ikey("foo", 100, ktVal),
ikey("foo", 100, ktVal)))
assertBytes(t, ikey("foo", 100, ktVal),
shortSep(ikey("foo", 100, ktVal),
ikey("foo", 100, ktDel)))
// When user keys are misordered
assertBytes(t, ikey("foo", 100, tVal),
shortSep(ikey("foo", 100, tVal),
ikey("bar", 99, tVal)))
assertBytes(t, ikey("foo", 100, ktVal),
shortSep(ikey("foo", 100, ktVal),
ikey("bar", 99, ktVal)))
// When user keys are different, but correctly ordered
assertBytes(t, ikey("g", uint64(kMaxSeq), tSeek),
shortSep(ikey("foo", 100, tVal),
ikey("hello", 200, tVal)))
assertBytes(t, ikey("g", uint64(kMaxSeq), ktSeek),
shortSep(ikey("foo", 100, ktVal),
ikey("hello", 200, ktVal)))
// When start user key is prefix of limit user key
assertBytes(t, ikey("foo", 100, tVal),
shortSep(ikey("foo", 100, tVal),
ikey("foobar", 200, tVal)))
assertBytes(t, ikey("foo", 100, ktVal),
shortSep(ikey("foo", 100, ktVal),
ikey("foobar", 200, ktVal)))
// When limit user key is prefix of start user key
assertBytes(t, ikey("foobar", 100, tVal),
shortSep(ikey("foobar", 100, tVal),
ikey("foo", 200, tVal)))
assertBytes(t, ikey("foobar", 100, ktVal),
shortSep(ikey("foobar", 100, ktVal),
ikey("foo", 200, ktVal)))
}
func TestIKeyShortestSuccessor(t *testing.T) {
assertBytes(t, ikey("g", uint64(kMaxSeq), tSeek),
shortSuccessor(ikey("foo", 100, tVal)))
assertBytes(t, ikey("\xff\xff", 100, tVal),
shortSuccessor(ikey("\xff\xff", 100, tVal)))
func TestIkeyShortestSuccessor(t *testing.T) {
assertBytes(t, ikey("g", uint64(kMaxSeq), ktSeek),
shortSuccessor(ikey("foo", 100, ktVal)))
assertBytes(t, ikey("\xff\xff", 100, ktVal),
shortSuccessor(ikey("\xff\xff", 100, ktVal)))
}

View File

@ -8,17 +8,17 @@
package memdb
import (
"errors"
"math/rand"
"sync"
"github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/util"
)
var (
ErrNotFound = util.ErrNotFound
ErrNotFound = errors.ErrNotFound
ErrIterReleased = errors.New("leveldb/memdb: iterator released")
)

View File

@ -11,6 +11,7 @@ import (
"github.com/syndtr/goleveldb/leveldb/cache"
"github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/filter"
"math"
)
const (
@ -20,12 +21,24 @@ const (
)
const (
DefaultBlockCacheSize = 8 * MiB
DefaultBlockRestartInterval = 16
DefaultBlockSize = 4 * KiB
DefaultCompressionType = SnappyCompression
DefaultCachedOpenFiles = 500
DefaultWriteBuffer = 4 * MiB
DefaultBlockCacheSize = 8 * MiB
DefaultBlockRestartInterval = 16
DefaultBlockSize = 4 * KiB
DefaultCompactionExpandLimitFactor = 25
DefaultCompactionGPOverlapsFactor = 10
DefaultCompactionL0Trigger = 4
DefaultCompactionSourceLimitFactor = 1
DefaultCompactionTableSize = 2 * MiB
DefaultCompactionTableSizeMultiplier = 1.0
DefaultCompactionTotalSize = 10 * MiB
DefaultCompactionTotalSizeMultiplier = 10.0
DefaultCompressionType = SnappyCompression
DefaultCachedOpenFiles = 500
DefaultMaxMemCompationLevel = 2
DefaultNumLevel = 7
DefaultWriteBuffer = 4 * MiB
DefaultWriteL0PauseTrigger = 12
DefaultWriteL0SlowdownTrigger = 8
)
type noCache struct{}
@ -65,34 +78,47 @@ const (
nCompression
)
// Strict is the DB strict level.
// Strict is the DB 'strict level'.
type Strict uint
const (
// If present then a corrupted or invalid chunk or block in manifest
// journal will cause an error istead of being dropped.
// journal will cause an error instead of being dropped.
// This will prevent database with corrupted manifest to be opened.
StrictManifest Strict = 1 << iota
// If present then a corrupted or invalid chunk or block in journal
// will cause an error istead of being dropped.
StrictJournal
// If present then journal chunk checksum will be verified.
StrictJournalChecksum
// If present then an invalid key/value pair will cause an error
// instead of being skipped.
StrictIterator
// If present then a corrupted or invalid chunk or block in journal
// will cause an error instead of being dropped.
// This will prevent database with corrupted journal to be opened.
StrictJournal
// If present then 'sorted table' block checksum will be verified.
// This has effect on both 'read operation' and compaction.
StrictBlockChecksum
// If present then a corrupted 'sorted table' will fails compaction.
// The database will enter read-only mode.
StrictCompaction
// If present then a corrupted 'sorted table' will halts 'read operation'.
StrictReader
// If present then leveldb.Recover will drop corrupted 'sorted table'.
StrictRecovery
// This only applicable for ReadOptions, if present then this ReadOptions
// 'strict level' will override global ones.
StrictOverride
// StrictAll enables all strict flags.
StrictAll = StrictManifest | StrictJournal | StrictJournalChecksum | StrictIterator | StrictBlockChecksum
StrictAll = StrictManifest | StrictJournalChecksum | StrictJournal | StrictBlockChecksum | StrictCompaction | StrictReader
// DefaultStrict is the default strict flags. Specify any strict flags
// will override default strict flags as whole (i.e. not OR'ed).
DefaultStrict = StrictJournalChecksum | StrictIterator | StrictBlockChecksum
DefaultStrict = StrictJournalChecksum | StrictBlockChecksum | StrictCompaction | StrictReader
// NoStrict disables all strict flags. Override default strict flags.
NoStrict = ^StrictAll
@ -132,6 +158,73 @@ type Options struct {
// The default value is 500.
CachedOpenFiles int
// CompactionExpandLimitFactor limits compaction size after expanded.
// This will be multiplied by table size limit at compaction target level.
//
// The default value is 25.
CompactionExpandLimitFactor int
// CompactionGPOverlapsFactor limits overlaps in grandparent (Level + 2) that a
// single 'sorted table' generates.
// This will be multiplied by table size limit at grandparent level.
//
// The default value is 10.
CompactionGPOverlapsFactor int
// CompactionL0Trigger defines number of 'sorted table' at level-0 that will
// trigger compaction.
//
// The default value is 4.
CompactionL0Trigger int
// CompactionSourceLimitFactor limits compaction source size. This doesn't apply to
// level-0.
// This will be multiplied by table size limit at compaction target level.
//
// The default value is 1.
CompactionSourceLimitFactor int
// CompactionTableSize limits size of 'sorted table' that compaction generates.
// The limits for each level will be calculated as:
// CompactionTableSize * (CompactionTableSizeMultiplier ^ Level)
// The multiplier for each level can also fine-tuned using CompactionTableSizeMultiplierPerLevel.
//
// The default value is 2MiB.
CompactionTableSize int
// CompactionTableSizeMultiplier defines multiplier for CompactionTableSize.
//
// The default value is 1.
CompactionTableSizeMultiplier float64
// CompactionTableSizeMultiplierPerLevel defines per-level multiplier for
// CompactionTableSize.
// Use zero to skip a level.
//
// The default value is nil.
CompactionTableSizeMultiplierPerLevel []float64
// CompactionTotalSize limits total size of 'sorted table' for each level.
// The limits for each level will be calculated as:
// CompactionTotalSize * (CompactionTotalSizeMultiplier ^ Level)
// The multiplier for each level can also fine-tuned using
// CompactionTotalSizeMultiplierPerLevel.
//
// The default value is 10MiB.
CompactionTotalSize int
// CompactionTotalSizeMultiplier defines multiplier for CompactionTotalSize.
//
// The default value is 10.
CompactionTotalSizeMultiplier float64
// CompactionTotalSizeMultiplierPerLevel defines per-level multiplier for
// CompactionTotalSize.
// Use zero to skip a level.
//
// The default value is nil.
CompactionTotalSizeMultiplierPerLevel []float64
// Comparer defines a total ordering over the space of []byte keys: a 'less
// than' relationship. The same comparison algorithm must be used for reads
// and writes over the lifetime of the DB.
@ -144,6 +237,11 @@ type Options struct {
// The default value (DefaultCompression) uses snappy compression.
Compression Compression
// DisableCompactionBackoff allows disable compaction retry backoff.
//
// The default value is false.
DisableCompactionBackoff bool
// ErrorIfExist defines whether an error should returned if the DB already
// exist.
//
@ -172,6 +270,19 @@ type Options struct {
// The default value is nil.
Filter filter.Filter
// MaxMemCompationLevel defines maximum level a newly compacted 'memdb'
// will be pushed into if doesn't creates overlap. This should less than
// NumLevel. Use -1 for level-0.
//
// The default is 2.
MaxMemCompationLevel int
// NumLevel defines number of database level. The level shouldn't changed
// between opens, or the database will panic.
//
// The default is 7.
NumLevel int
// Strict defines the DB strict level.
Strict Strict
@ -183,6 +294,18 @@ type Options struct {
//
// The default value is 4MiB.
WriteBuffer int
// WriteL0StopTrigger defines number of 'sorted table' at level-0 that will
// pause write.
//
// The default value is 12.
WriteL0PauseTrigger int
// WriteL0SlowdownTrigger defines number of 'sorted table' at level-0 that
// will trigger write slowdown.
//
// The default value is 8.
WriteL0SlowdownTrigger int
}
func (o *Options) GetAltFilters() []filter.Filter {
@ -222,6 +345,79 @@ func (o *Options) GetCachedOpenFiles() int {
return o.CachedOpenFiles
}
func (o *Options) GetCompactionExpandLimit(level int) int {
factor := DefaultCompactionExpandLimitFactor
if o != nil && o.CompactionExpandLimitFactor > 0 {
factor = o.CompactionExpandLimitFactor
}
return o.GetCompactionTableSize(level+1) * factor
}
func (o *Options) GetCompactionGPOverlaps(level int) int {
factor := DefaultCompactionGPOverlapsFactor
if o != nil && o.CompactionGPOverlapsFactor > 0 {
factor = o.CompactionGPOverlapsFactor
}
return o.GetCompactionTableSize(level+2) * factor
}
func (o *Options) GetCompactionL0Trigger() int {
if o == nil || o.CompactionL0Trigger == 0 {
return DefaultCompactionL0Trigger
}
return o.CompactionL0Trigger
}
func (o *Options) GetCompactionSourceLimit(level int) int {
factor := DefaultCompactionSourceLimitFactor
if o != nil && o.CompactionSourceLimitFactor > 0 {
factor = o.CompactionSourceLimitFactor
}
return o.GetCompactionTableSize(level+1) * factor
}
func (o *Options) GetCompactionTableSize(level int) int {
var (
base = DefaultCompactionTableSize
mult float64
)
if o != nil {
if o.CompactionTableSize > 0 {
base = o.CompactionTableSize
}
if len(o.CompactionTableSizeMultiplierPerLevel) > level && o.CompactionTableSizeMultiplierPerLevel[level] > 0 {
mult = o.CompactionTableSizeMultiplierPerLevel[level]
} else if o.CompactionTableSizeMultiplier > 0 {
mult = math.Pow(o.CompactionTableSizeMultiplier, float64(level))
}
}
if mult == 0 {
mult = math.Pow(DefaultCompactionTableSizeMultiplier, float64(level))
}
return int(float64(base) * mult)
}
func (o *Options) GetCompactionTotalSize(level int) int64 {
var (
base = DefaultCompactionTotalSize
mult float64
)
if o != nil {
if o.CompactionTotalSize > 0 {
base = o.CompactionTotalSize
}
if len(o.CompactionTotalSizeMultiplierPerLevel) > level && o.CompactionTotalSizeMultiplierPerLevel[level] > 0 {
mult = o.CompactionTotalSizeMultiplierPerLevel[level]
} else if o.CompactionTotalSizeMultiplier > 0 {
mult = math.Pow(o.CompactionTotalSizeMultiplier, float64(level))
}
}
if mult == 0 {
mult = math.Pow(DefaultCompactionTotalSizeMultiplier, float64(level))
}
return int64(float64(base) * mult)
}
func (o *Options) GetComparer() comparer.Comparer {
if o == nil || o.Comparer == nil {
return comparer.DefaultComparer
@ -236,6 +432,13 @@ func (o *Options) GetCompression() Compression {
return o.Compression
}
func (o *Options) GetDisableCompactionBackoff() bool {
if o == nil {
return false
}
return o.DisableCompactionBackoff
}
func (o *Options) GetErrorIfExist() bool {
if o == nil {
return false
@ -257,6 +460,28 @@ func (o *Options) GetFilter() filter.Filter {
return o.Filter
}
func (o *Options) GetMaxMemCompationLevel() int {
level := DefaultMaxMemCompationLevel
if o != nil {
if o.MaxMemCompationLevel > 0 {
level = o.MaxMemCompationLevel
} else if o.MaxMemCompationLevel == -1 {
level = 0
}
}
if level >= o.GetNumLevel() {
return o.GetNumLevel() - 1
}
return level
}
func (o *Options) GetNumLevel() int {
if o == nil || o.NumLevel <= 0 {
return DefaultNumLevel
}
return o.NumLevel
}
func (o *Options) GetStrict(strict Strict) bool {
if o == nil || o.Strict == 0 {
return DefaultStrict&strict != 0
@ -271,6 +496,20 @@ func (o *Options) GetWriteBuffer() int {
return o.WriteBuffer
}
func (o *Options) GetWriteL0PauseTrigger() int {
if o == nil || o.WriteL0PauseTrigger == 0 {
return DefaultWriteL0PauseTrigger
}
return o.WriteL0PauseTrigger
}
func (o *Options) GetWriteL0SlowdownTrigger() int {
if o == nil || o.WriteL0SlowdownTrigger == 0 {
return DefaultWriteL0SlowdownTrigger
}
return o.WriteL0SlowdownTrigger
}
// ReadOptions holds the optional parameters for 'read operation'. The
// 'read operation' includes Get, Find and NewIterator.
type ReadOptions struct {
@ -281,8 +520,8 @@ type ReadOptions struct {
// The default value is false.
DontFillCache bool
// Strict overrides global DB strict level. Only StrictIterator and
// StrictBlockChecksum that does have effects here.
// Strict will be OR'ed with global DB 'strict level' unless StrictOverride
// is present. Currently only StrictReader that has effect here.
Strict Strict
}
@ -324,3 +563,11 @@ func (wo *WriteOptions) GetSync() bool {
}
return wo.Sync
}
func GetStrict(o *Options, ro *ReadOptions, strict Strict) bool {
if ro.GetStrict(StrictOverride) {
return ro.GetStrict(strict)
} else {
return o.GetStrict(strict) || ro.GetStrict(strict)
}
}

View File

@ -12,30 +12,86 @@ import (
"github.com/syndtr/goleveldb/leveldb/opt"
)
func (s *session) setOptions(o *opt.Options) {
s.o = &opt.Options{}
func dupOptions(o *opt.Options) *opt.Options {
newo := &opt.Options{}
if o != nil {
*s.o = *o
*newo = *o
}
return newo
}
func (s *session) setOptions(o *opt.Options) {
no := dupOptions(o)
// Alternative filters.
if filters := o.GetAltFilters(); len(filters) > 0 {
s.o.AltFilters = make([]filter.Filter, len(filters))
no.AltFilters = make([]filter.Filter, len(filters))
for i, filter := range filters {
s.o.AltFilters[i] = &iFilter{filter}
no.AltFilters[i] = &iFilter{filter}
}
}
// Block cache.
switch o.GetBlockCache() {
case nil:
s.o.BlockCache = cache.NewLRUCache(opt.DefaultBlockCacheSize)
no.BlockCache = cache.NewLRUCache(opt.DefaultBlockCacheSize)
case opt.NoCache:
s.o.BlockCache = nil
no.BlockCache = nil
}
// Comparer.
s.icmp = &iComparer{o.GetComparer()}
s.o.Comparer = s.icmp
no.Comparer = s.icmp
// Filter.
if filter := o.GetFilter(); filter != nil {
s.o.Filter = &iFilter{filter}
no.Filter = &iFilter{filter}
}
s.o = &cachedOptions{Options: no}
s.o.cache()
}
type cachedOptions struct {
*opt.Options
compactionExpandLimit []int
compactionGPOverlaps []int
compactionSourceLimit []int
compactionTableSize []int
compactionTotalSize []int64
}
func (co *cachedOptions) cache() {
numLevel := co.Options.GetNumLevel()
co.compactionExpandLimit = make([]int, numLevel)
co.compactionGPOverlaps = make([]int, numLevel)
co.compactionSourceLimit = make([]int, numLevel)
co.compactionTableSize = make([]int, numLevel)
co.compactionTotalSize = make([]int64, numLevel)
for level := 0; level < numLevel; level++ {
co.compactionExpandLimit[level] = co.Options.GetCompactionExpandLimit(level)
co.compactionGPOverlaps[level] = co.Options.GetCompactionGPOverlaps(level)
co.compactionSourceLimit[level] = co.Options.GetCompactionSourceLimit(level)
co.compactionTableSize[level] = co.Options.GetCompactionTableSize(level)
co.compactionTotalSize[level] = co.Options.GetCompactionTotalSize(level)
}
}
func (co *cachedOptions) GetCompactionExpandLimit(level int) int {
return co.compactionExpandLimit[level]
}
func (co *cachedOptions) GetCompactionGPOverlaps(level int) int {
return co.compactionGPOverlaps[level]
}
func (co *cachedOptions) GetCompactionSourceLimit(level int) int {
return co.compactionSourceLimit[level]
}
func (co *cachedOptions) GetCompactionTableSize(level int) int {
return co.compactionTableSize[level]
}
func (co *cachedOptions) GetCompactionTotalSize(level int) int64 {
return co.compactionTotalSize[level]
}

View File

@ -7,12 +7,13 @@
package leveldb
import (
"errors"
"fmt"
"io"
"os"
"sync"
"sync/atomic"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/journal"
"github.com/syndtr/goleveldb/leveldb/opt"
@ -20,18 +21,31 @@ import (
"github.com/syndtr/goleveldb/leveldb/util"
)
type ErrManifestCorrupted struct {
Field string
Reason string
}
func (e *ErrManifestCorrupted) Error() string {
return fmt.Sprintf("leveldb: manifest corrupted (field '%s'): %s", e.Field, e.Reason)
}
func newErrManifestCorrupted(f storage.File, field, reason string) error {
return errors.NewErrCorrupted(f, &ErrManifestCorrupted{field, reason})
}
// session represent a persistent database session.
type session struct {
// Need 64-bit alignment.
stFileNum uint64 // current unused file number
stNextFileNum uint64 // current unused file number
stJournalNum uint64 // current journal file number; need external synchronization
stPrevJournalNum uint64 // prev journal file number; no longer used; for compatibility with older version of leveldb
stSeq uint64 // last mem compacted seq; need external synchronization
stSeqNum uint64 // last mem compacted seq; need external synchronization
stTempFileNum uint64
stor storage.Storage
storLock util.Releaser
o *opt.Options
o *cachedOptions
icmp *iComparer
tops *tOps
@ -39,9 +53,9 @@ type session struct {
manifestWriter storage.Writer
manifestFile storage.File
stCptrs [kNumLevels]iKey // compact pointers; need external synchronization
stVersion *version // current version
vmu sync.Mutex
stCompPtrs []iKey // compaction pointers; need external synchronization
stVersion *version // current version
vmu sync.Mutex
}
// Creates new initialized session instance.
@ -54,13 +68,14 @@ func newSession(stor storage.Storage, o *opt.Options) (s *session, err error) {
return
}
s = &session{
stor: stor,
storLock: storLock,
stor: stor,
storLock: storLock,
stCompPtrs: make([]iKey, o.GetNumLevel()),
}
s.setOptions(o)
s.tops = newTableOps(s, s.o.GetCachedOpenFiles())
s.setVersion(&version{s: s})
s.log("log@legend F·NumFile S·FileSize N·Entry C·BadEntry B·BadBlock D·DeletedEntry L·Level Q·SeqNum T·TimeElapsed")
s.setVersion(newVersion(s))
s.log("log@legend F·NumFile S·FileSize N·Entry C·BadEntry B·BadBlock Ke·KeyError D·DroppedEntry L·Level Q·SeqNum T·TimeElapsed")
return
}
@ -100,26 +115,26 @@ func (s *session) recover() (err error) {
// Don't return os.ErrNotExist if the underlying storage contains
// other files that belong to LevelDB. So the DB won't get trashed.
if files, _ := s.stor.GetFiles(storage.TypeAll); len(files) > 0 {
err = ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest file missing")}
err = &errors.ErrCorrupted{File: &storage.FileInfo{Type: storage.TypeManifest}, Err: &errors.ErrMissingFiles{}}
}
}
}()
file, err := s.stor.GetManifest()
m, err := s.stor.GetManifest()
if err != nil {
return
}
reader, err := file.Open()
reader, err := m.Open()
if err != nil {
return
}
defer reader.Close()
strict := s.o.GetStrict(opt.StrictManifest)
jr := journal.NewReader(reader, dropper{s, file}, strict, true)
jr := journal.NewReader(reader, dropper{s, m}, strict, true)
staging := s.version_NB().newStaging()
rec := &sessionRecord{}
staging := s.stVersion.newStaging()
rec := &sessionRecord{numLevel: s.o.GetNumLevel()}
for {
var r io.Reader
r, err = jr.Next()
@ -128,51 +143,57 @@ func (s *session) recover() (err error) {
err = nil
break
}
return
return errors.SetFile(err, m)
}
err = rec.decode(r)
if err == nil {
// save compact pointers
for _, r := range rec.compactionPointers {
s.stCptrs[r.level] = iKey(r.ikey)
for _, r := range rec.compPtrs {
s.stCompPtrs[r.level] = iKey(r.ikey)
}
// commit record to version staging
staging.commit(rec)
} else if strict {
return ErrCorrupted{Type: CorruptedManifest, Err: err}
} else {
s.logf("manifest error: %v (skipped)", err)
err = errors.SetFile(err, m)
if strict || !errors.IsCorrupted(err) {
return
} else {
s.logf("manifest error: %v (skipped)", errors.SetFile(err, m))
}
}
rec.resetCompactionPointers()
rec.resetCompPtrs()
rec.resetAddedTables()
rec.resetDeletedTables()
}
switch {
case !rec.has(recComparer):
return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing comparer name")}
return newErrManifestCorrupted(m, "comparer", "missing")
case rec.comparer != s.icmp.uName():
return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: comparer mismatch, " + "want '" + s.icmp.uName() + "', " + "got '" + rec.comparer + "'")}
case !rec.has(recNextNum):
return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing next file number")}
return newErrManifestCorrupted(m, "comparer", fmt.Sprintf("mismatch: want '%s', got '%s'", s.icmp.uName(), rec.comparer))
case !rec.has(recNextFileNum):
return newErrManifestCorrupted(m, "next-file-num", "missing")
case !rec.has(recJournalNum):
return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing journal file number")}
case !rec.has(recSeq):
return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing seq number")}
return newErrManifestCorrupted(m, "journal-file-num", "missing")
case !rec.has(recSeqNum):
return newErrManifestCorrupted(m, "seq-num", "missing")
}
s.manifestFile = file
s.manifestFile = m
s.setVersion(staging.finish())
s.setFileNum(rec.nextNum)
s.setNextFileNum(rec.nextFileNum)
s.recordCommited(rec)
return nil
}
// Commit session; need external synchronization.
func (s *session) commit(r *sessionRecord) (err error) {
v := s.version()
defer v.release()
// spawn new version based on current version
nv := s.version_NB().spawn(r)
nv := v.spawn(r)
if s.manifest == nil {
// manifest journal writer not yet created, create one
@ -191,13 +212,13 @@ func (s *session) commit(r *sessionRecord) (err error) {
// Pick a compaction based on current state; need external synchronization.
func (s *session) pickCompaction() *compaction {
v := s.version_NB()
v := s.version()
var level int
var t0 tFiles
if v.cScore >= 1 {
level = v.cLevel
cptr := s.stCptrs[level]
cptr := s.stCompPtrs[level]
tables := v.tables[level]
for _, t := range tables {
if cptr == nil || s.icmp.Compare(t.imax, cptr) > 0 {
@ -214,27 +235,21 @@ func (s *session) pickCompaction() *compaction {
level = ts.level
t0 = append(t0, ts.table)
} else {
v.release()
return nil
}
}
c := &compaction{s: s, v: v, level: level}
if level == 0 {
imin, imax := t0.getRange(s.icmp)
t0 = v.tables[0].getOverlaps(t0[:0], s.icmp, imin.ukey(), imax.ukey(), true)
}
c.tables[0] = t0
c.expand()
return c
return newCompaction(s, v, level, t0)
}
// Create compaction from given level and range; need external synchronization.
func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction {
v := s.version_NB()
v := s.version()
t0 := v.tables[level].getOverlaps(nil, s.icmp, umin, umax, level == 0)
if len(t0) == 0 {
v.release()
return nil
}
@ -243,7 +258,7 @@ func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction {
// and we must not pick one file and drop another older file if the
// two files overlap.
if level > 0 {
limit := uint64(kMaxTableSize)
limit := uint64(v.s.o.GetCompactionSourceLimit(level))
total := uint64(0)
for i, t := range t0 {
total += t.size
@ -255,9 +270,20 @@ func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction {
}
}
c := &compaction{s: s, v: v, level: level}
c.tables[0] = t0
return newCompaction(s, v, level, t0)
}
func newCompaction(s *session, v *version, level int, t0 tFiles) *compaction {
c := &compaction{
s: s,
v: v,
level: level,
tables: [2]tFiles{t0, nil},
maxGPOverlaps: uint64(s.o.GetCompactionGPOverlaps(level)),
tPtrs: make([]int, s.o.GetNumLevel()),
}
c.expand()
c.save()
return c
}
@ -266,25 +292,57 @@ type compaction struct {
s *session
v *version
level int
tables [2]tFiles
level int
tables [2]tFiles
maxGPOverlaps uint64
gp tFiles
gpidx int
seenKey bool
overlappedBytes uint64
imin, imax iKey
gp tFiles
gpi int
seenKey bool
gpOverlappedBytes uint64
imin, imax iKey
tPtrs []int
released bool
tPtrs [kNumLevels]int
snapGPI int
snapSeenKey bool
snapGPOverlappedBytes uint64
snapTPtrs []int
}
func (c *compaction) save() {
c.snapGPI = c.gpi
c.snapSeenKey = c.seenKey
c.snapGPOverlappedBytes = c.gpOverlappedBytes
c.snapTPtrs = append(c.snapTPtrs[:0], c.tPtrs...)
}
func (c *compaction) restore() {
c.gpi = c.snapGPI
c.seenKey = c.snapSeenKey
c.gpOverlappedBytes = c.snapGPOverlappedBytes
c.tPtrs = append(c.tPtrs[:0], c.snapTPtrs...)
}
func (c *compaction) release() {
if !c.released {
c.released = true
c.v.release()
}
}
// Expand compacted tables; need external synchronization.
func (c *compaction) expand() {
level := c.level
vt0, vt1 := c.v.tables[level], c.v.tables[level+1]
limit := uint64(c.s.o.GetCompactionExpandLimit(c.level))
vt0, vt1 := c.v.tables[c.level], c.v.tables[c.level+1]
t0, t1 := c.tables[0], c.tables[1]
imin, imax := t0.getRange(c.s.icmp)
// We expand t0 here just incase ukey hop across tables.
t0 = vt0.getOverlaps(t0, c.s.icmp, imin.ukey(), imax.ukey(), c.level == 0)
if len(t0) != len(c.tables[0]) {
imin, imax = t0.getRange(c.s.icmp)
}
t1 = vt1.getOverlaps(t1, c.s.icmp, imin.ukey(), imax.ukey(), false)
// Get entire range covered by compaction.
amin, amax := append(t0, t1...).getRange(c.s.icmp)
@ -292,13 +350,13 @@ func (c *compaction) expand() {
// See if we can grow the number of inputs in "level" without
// changing the number of "level+1" files we pick up.
if len(t1) > 0 {
exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), level == 0)
if len(exp0) > len(t0) && t1.size()+exp0.size() < kExpCompactionMaxBytes {
exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), c.level == 0)
if len(exp0) > len(t0) && t1.size()+exp0.size() < limit {
xmin, xmax := exp0.getRange(c.s.icmp)
exp1 := vt1.getOverlaps(nil, c.s.icmp, xmin.ukey(), xmax.ukey(), false)
if len(exp1) == len(t1) {
c.s.logf("table@compaction expanding L%d+L%d (F·%d S·%s)+(F·%d S·%s) -> (F·%d S·%s)+(F·%d S·%s)",
level, level+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())),
c.level, c.level+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())),
len(exp0), shortenb(int(exp0.size())), len(exp1), shortenb(int(exp1.size())))
imin, imax = xmin, xmax
t0, t1 = exp0, exp1
@ -309,8 +367,8 @@ func (c *compaction) expand() {
// Compute the set of grandparent files that overlap this compaction
// (parent == level+1; grandparent == level+2)
if level+2 < kNumLevels {
c.gp = c.v.tables[level+2].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false)
if c.level+2 < c.s.o.GetNumLevel() {
c.gp = c.v.tables[c.level+2].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false)
}
c.tables[0], c.tables[1] = t0, t1
@ -319,7 +377,7 @@ func (c *compaction) expand() {
// Check whether compaction is trivial.
func (c *compaction) trivial() bool {
return len(c.tables[0]) == 1 && len(c.tables[1]) == 0 && c.gp.size() <= kMaxGrandParentOverlapBytes
return len(c.tables[0]) == 1 && len(c.tables[1]) == 0 && c.gp.size() <= c.maxGPOverlaps
}
func (c *compaction) baseLevelForKey(ukey []byte) bool {
@ -341,20 +399,20 @@ func (c *compaction) baseLevelForKey(ukey []byte) bool {
}
func (c *compaction) shouldStopBefore(ikey iKey) bool {
for ; c.gpidx < len(c.gp); c.gpidx++ {
gp := c.gp[c.gpidx]
for ; c.gpi < len(c.gp); c.gpi++ {
gp := c.gp[c.gpi]
if c.s.icmp.Compare(ikey, gp.imax) <= 0 {
break
}
if c.seenKey {
c.overlappedBytes += gp.size
c.gpOverlappedBytes += gp.size
}
}
c.seenKey = true
if c.overlappedBytes > kMaxGrandParentOverlapBytes {
if c.gpOverlappedBytes > c.maxGPOverlaps {
// Too much overlap for current output; start new output.
c.overlappedBytes = 0
c.gpOverlappedBytes = 0
return true
}
return false
@ -373,8 +431,12 @@ func (c *compaction) newIterator() iterator.Iterator {
// Options.
ro := &opt.ReadOptions{
DontFillCache: true,
Strict: opt.StrictOverride,
}
strict := c.s.o.GetStrict(opt.StrictCompaction)
if strict {
ro.Strict |= opt.StrictReader
}
strict := c.s.o.GetStrict(opt.StrictIterator)
for i, tables := range c.tables {
if len(tables) == 0 {
@ -387,10 +449,10 @@ func (c *compaction) newIterator() iterator.Iterator {
its = append(its, c.s.tops.newIterator(t, nil, ro))
}
} else {
it := iterator.NewIndexedIterator(tables.newIndexIterator(c.s.tops, c.s.icmp, nil, ro), strict, true)
it := iterator.NewIndexedIterator(tables.newIndexIterator(c.s.tops, c.s.icmp, nil, ro), strict)
its = append(its, it)
}
}
return iterator.NewMergedIterator(its, c.s.icmp, true)
return iterator.NewMergedIterator(its, c.s.icmp, strict)
}

View File

@ -9,11 +9,11 @@ package leveldb
import (
"bufio"
"encoding/binary"
"errors"
"io"
)
"strings"
var errCorruptManifest = errors.New("leveldb: corrupt manifest")
"github.com/syndtr/goleveldb/leveldb/errors"
)
type byteReader interface {
io.Reader
@ -22,13 +22,13 @@ type byteReader interface {
// These numbers are written to disk and should not be changed.
const (
recComparer = 1
recJournalNum = 2
recNextNum = 3
recSeq = 4
recCompactionPointer = 5
recDeletedTable = 6
recNewTable = 7
recComparer = 1
recJournalNum = 2
recNextFileNum = 3
recSeqNum = 4
recCompPtr = 5
recDelTable = 6
recAddTable = 7
// 8 was used for large value refs
recPrevJournalNum = 9
)
@ -38,7 +38,7 @@ type cpRecord struct {
ikey iKey
}
type ntRecord struct {
type atRecord struct {
level int
num uint64
size uint64
@ -46,27 +46,26 @@ type ntRecord struct {
imax iKey
}
func (r ntRecord) makeFile(s *session) *tFile {
return newTableFile(s.getTableFile(r.num), r.size, r.imin, r.imax)
}
type dtRecord struct {
level int
num uint64
}
type sessionRecord struct {
hasRec int
comparer string
journalNum uint64
prevJournalNum uint64
nextNum uint64
seq uint64
compactionPointers []cpRecord
addedTables []ntRecord
deletedTables []dtRecord
scratch [binary.MaxVarintLen64]byte
err error
numLevel int
hasRec int
comparer string
journalNum uint64
prevJournalNum uint64
nextFileNum uint64
seqNum uint64
compPtrs []cpRecord
addedTables []atRecord
deletedTables []dtRecord
scratch [binary.MaxVarintLen64]byte
err error
}
func (p *sessionRecord) has(rec int) bool {
@ -88,29 +87,29 @@ func (p *sessionRecord) setPrevJournalNum(num uint64) {
p.prevJournalNum = num
}
func (p *sessionRecord) setNextNum(num uint64) {
p.hasRec |= 1 << recNextNum
p.nextNum = num
func (p *sessionRecord) setNextFileNum(num uint64) {
p.hasRec |= 1 << recNextFileNum
p.nextFileNum = num
}
func (p *sessionRecord) setSeq(seq uint64) {
p.hasRec |= 1 << recSeq
p.seq = seq
func (p *sessionRecord) setSeqNum(num uint64) {
p.hasRec |= 1 << recSeqNum
p.seqNum = num
}
func (p *sessionRecord) addCompactionPointer(level int, ikey iKey) {
p.hasRec |= 1 << recCompactionPointer
p.compactionPointers = append(p.compactionPointers, cpRecord{level, ikey})
func (p *sessionRecord) addCompPtr(level int, ikey iKey) {
p.hasRec |= 1 << recCompPtr
p.compPtrs = append(p.compPtrs, cpRecord{level, ikey})
}
func (p *sessionRecord) resetCompactionPointers() {
p.hasRec &= ^(1 << recCompactionPointer)
p.compactionPointers = p.compactionPointers[:0]
func (p *sessionRecord) resetCompPtrs() {
p.hasRec &= ^(1 << recCompPtr)
p.compPtrs = p.compPtrs[:0]
}
func (p *sessionRecord) addTable(level int, num, size uint64, imin, imax iKey) {
p.hasRec |= 1 << recNewTable
p.addedTables = append(p.addedTables, ntRecord{level, num, size, imin, imax})
p.hasRec |= 1 << recAddTable
p.addedTables = append(p.addedTables, atRecord{level, num, size, imin, imax})
}
func (p *sessionRecord) addTableFile(level int, t *tFile) {
@ -118,17 +117,17 @@ func (p *sessionRecord) addTableFile(level int, t *tFile) {
}
func (p *sessionRecord) resetAddedTables() {
p.hasRec &= ^(1 << recNewTable)
p.hasRec &= ^(1 << recAddTable)
p.addedTables = p.addedTables[:0]
}
func (p *sessionRecord) deleteTable(level int, num uint64) {
p.hasRec |= 1 << recDeletedTable
func (p *sessionRecord) delTable(level int, num uint64) {
p.hasRec |= 1 << recDelTable
p.deletedTables = append(p.deletedTables, dtRecord{level, num})
}
func (p *sessionRecord) resetDeletedTables() {
p.hasRec &= ^(1 << recDeletedTable)
p.hasRec &= ^(1 << recDelTable)
p.deletedTables = p.deletedTables[:0]
}
@ -161,26 +160,26 @@ func (p *sessionRecord) encode(w io.Writer) error {
p.putUvarint(w, recJournalNum)
p.putUvarint(w, p.journalNum)
}
if p.has(recNextNum) {
p.putUvarint(w, recNextNum)
p.putUvarint(w, p.nextNum)
if p.has(recNextFileNum) {
p.putUvarint(w, recNextFileNum)
p.putUvarint(w, p.nextFileNum)
}
if p.has(recSeq) {
p.putUvarint(w, recSeq)
p.putUvarint(w, p.seq)
if p.has(recSeqNum) {
p.putUvarint(w, recSeqNum)
p.putUvarint(w, p.seqNum)
}
for _, r := range p.compactionPointers {
p.putUvarint(w, recCompactionPointer)
for _, r := range p.compPtrs {
p.putUvarint(w, recCompPtr)
p.putUvarint(w, uint64(r.level))
p.putBytes(w, r.ikey)
}
for _, r := range p.deletedTables {
p.putUvarint(w, recDeletedTable)
p.putUvarint(w, recDelTable)
p.putUvarint(w, uint64(r.level))
p.putUvarint(w, r.num)
}
for _, r := range p.addedTables {
p.putUvarint(w, recNewTable)
p.putUvarint(w, recAddTable)
p.putUvarint(w, uint64(r.level))
p.putUvarint(w, r.num)
p.putUvarint(w, r.size)
@ -190,14 +189,16 @@ func (p *sessionRecord) encode(w io.Writer) error {
return p.err
}
func (p *sessionRecord) readUvarint(r io.ByteReader) uint64 {
func (p *sessionRecord) readUvarintMayEOF(field string, r io.ByteReader, mayEOF bool) uint64 {
if p.err != nil {
return 0
}
x, err := binary.ReadUvarint(r)
if err != nil {
if err == io.EOF {
p.err = errCorruptManifest
if err == io.ErrUnexpectedEOF || (mayEOF == false && err == io.EOF) {
p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "short read"})
} else if strings.HasPrefix(err.Error(), "binary:") {
p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, err.Error()})
} else {
p.err = err
}
@ -206,35 +207,39 @@ func (p *sessionRecord) readUvarint(r io.ByteReader) uint64 {
return x
}
func (p *sessionRecord) readBytes(r byteReader) []byte {
func (p *sessionRecord) readUvarint(field string, r io.ByteReader) uint64 {
return p.readUvarintMayEOF(field, r, false)
}
func (p *sessionRecord) readBytes(field string, r byteReader) []byte {
if p.err != nil {
return nil
}
n := p.readUvarint(r)
n := p.readUvarint(field, r)
if p.err != nil {
return nil
}
x := make([]byte, n)
_, p.err = io.ReadFull(r, x)
if p.err != nil {
if p.err == io.EOF {
p.err = errCorruptManifest
if p.err == io.ErrUnexpectedEOF {
p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "short read"})
}
return nil
}
return x
}
func (p *sessionRecord) readLevel(r io.ByteReader) int {
func (p *sessionRecord) readLevel(field string, r io.ByteReader) int {
if p.err != nil {
return 0
}
x := p.readUvarint(r)
x := p.readUvarint(field, r)
if p.err != nil {
return 0
}
if x >= kNumLevels {
p.err = errCorruptManifest
if x >= uint64(p.numLevel) {
p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "invalid level number"})
return 0
}
return int(x)
@ -247,59 +252,59 @@ func (p *sessionRecord) decode(r io.Reader) error {
}
p.err = nil
for p.err == nil {
rec, err := binary.ReadUvarint(br)
if err != nil {
if err == io.EOF {
err = nil
rec := p.readUvarintMayEOF("field-header", br, true)
if p.err != nil {
if p.err == io.EOF {
return nil
}
return err
return p.err
}
switch rec {
case recComparer:
x := p.readBytes(br)
x := p.readBytes("comparer", br)
if p.err == nil {
p.setComparer(string(x))
}
case recJournalNum:
x := p.readUvarint(br)
x := p.readUvarint("journal-num", br)
if p.err == nil {
p.setJournalNum(x)
}
case recPrevJournalNum:
x := p.readUvarint(br)
x := p.readUvarint("prev-journal-num", br)
if p.err == nil {
p.setPrevJournalNum(x)
}
case recNextNum:
x := p.readUvarint(br)
case recNextFileNum:
x := p.readUvarint("next-file-num", br)
if p.err == nil {
p.setNextNum(x)
p.setNextFileNum(x)
}
case recSeq:
x := p.readUvarint(br)
case recSeqNum:
x := p.readUvarint("seq-num", br)
if p.err == nil {
p.setSeq(x)
p.setSeqNum(x)
}
case recCompactionPointer:
level := p.readLevel(br)
ikey := p.readBytes(br)
case recCompPtr:
level := p.readLevel("comp-ptr.level", br)
ikey := p.readBytes("comp-ptr.ikey", br)
if p.err == nil {
p.addCompactionPointer(level, iKey(ikey))
p.addCompPtr(level, iKey(ikey))
}
case recNewTable:
level := p.readLevel(br)
num := p.readUvarint(br)
size := p.readUvarint(br)
imin := p.readBytes(br)
imax := p.readBytes(br)
case recAddTable:
level := p.readLevel("add-table.level", br)
num := p.readUvarint("add-table.num", br)
size := p.readUvarint("add-table.size", br)
imin := p.readBytes("add-table.imin", br)
imax := p.readBytes("add-table.imax", br)
if p.err == nil {
p.addTable(level, num, size, imin, imax)
}
case recDeletedTable:
level := p.readLevel(br)
num := p.readUvarint(br)
case recDelTable:
level := p.readLevel("del-table.level", br)
num := p.readUvarint("del-table.num", br)
if p.err == nil {
p.deleteTable(level, num)
p.delTable(level, num)
}
}
}

View File

@ -9,6 +9,8 @@ package leveldb
import (
"bytes"
"testing"
"github.com/syndtr/goleveldb/leveldb/opt"
)
func decodeEncode(v *sessionRecord) (res bool, err error) {
@ -17,7 +19,7 @@ func decodeEncode(v *sessionRecord) (res bool, err error) {
if err != nil {
return
}
v2 := new(sessionRecord)
v2 := &sessionRecord{numLevel: opt.DefaultNumLevel}
err = v.decode(b)
if err != nil {
return
@ -32,7 +34,7 @@ func decodeEncode(v *sessionRecord) (res bool, err error) {
func TestSessionRecord_EncodeDecode(t *testing.T) {
big := uint64(1) << 50
v := new(sessionRecord)
v := &sessionRecord{numLevel: opt.DefaultNumLevel}
i := uint64(0)
test := func() {
res, err := decodeEncode(v)
@ -47,16 +49,16 @@ func TestSessionRecord_EncodeDecode(t *testing.T) {
for ; i < 4; i++ {
test()
v.addTable(3, big+300+i, big+400+i,
newIKey([]byte("foo"), big+500+1, tVal),
newIKey([]byte("zoo"), big+600+1, tDel))
v.deleteTable(4, big+700+i)
v.addCompactionPointer(int(i), newIKey([]byte("x"), big+900+1, tVal))
newIkey([]byte("foo"), big+500+1, ktVal),
newIkey([]byte("zoo"), big+600+1, ktDel))
v.delTable(4, big+700+i)
v.addCompPtr(int(i), newIkey([]byte("x"), big+900+1, ktVal))
}
v.setComparer("foo")
v.setJournalNum(big + 100)
v.setPrevJournalNum(big + 99)
v.setNextNum(big + 200)
v.setSeq(big + 1000)
v.setNextFileNum(big + 200)
v.setSeqNum(big + 1000)
test()
}

View File

@ -22,7 +22,7 @@ type dropper struct {
}
func (d dropper) Drop(err error) {
if e, ok := err.(journal.ErrCorrupted); ok {
if e, ok := err.(*journal.ErrCorrupted); ok {
d.s.logf("journal@drop %s-%d S·%s %q", d.file.Type(), d.file.Num(), shortenb(e.Size), e.Reason)
} else {
d.s.logf("journal@drop %s-%d %q", d.file.Type(), d.file.Num(), err)
@ -51,9 +51,14 @@ func (s *session) newTemp() storage.File {
return s.stor.GetFile(num, storage.TypeTemp)
}
func (s *session) tableFileFromRecord(r atRecord) *tFile {
return newTableFile(s.getTableFile(r.num), r.size, r.imin, r.imax)
}
// Session state.
// Get current version.
// Get current version. This will incr version ref, must call
// version.release (exactly once) after use.
func (s *session) version() *version {
s.vmu.Lock()
defer s.vmu.Unlock()
@ -61,61 +66,56 @@ func (s *session) version() *version {
return s.stVersion
}
// Get current version; no barrier.
func (s *session) version_NB() *version {
return s.stVersion
}
// Set current version to v.
func (s *session) setVersion(v *version) {
s.vmu.Lock()
v.ref = 1
v.ref = 1 // Holds by session.
if old := s.stVersion; old != nil {
v.ref++
v.ref++ // Holds by old version.
old.next = v
old.release_NB()
old.releaseNB()
}
s.stVersion = v
s.vmu.Unlock()
}
// Get current unused file number.
func (s *session) fileNum() uint64 {
return atomic.LoadUint64(&s.stFileNum)
func (s *session) nextFileNum() uint64 {
return atomic.LoadUint64(&s.stNextFileNum)
}
// Get current unused file number to num.
func (s *session) setFileNum(num uint64) {
atomic.StoreUint64(&s.stFileNum, num)
// Set current unused file number to num.
func (s *session) setNextFileNum(num uint64) {
atomic.StoreUint64(&s.stNextFileNum, num)
}
// Mark file number as used.
func (s *session) markFileNum(num uint64) {
num += 1
nextFileNum := num + 1
for {
old, x := s.stFileNum, num
old, x := s.stNextFileNum, nextFileNum
if old > x {
x = old
}
if atomic.CompareAndSwapUint64(&s.stFileNum, old, x) {
if atomic.CompareAndSwapUint64(&s.stNextFileNum, old, x) {
break
}
}
}
// Allocate a file number.
func (s *session) allocFileNum() (num uint64) {
return atomic.AddUint64(&s.stFileNum, 1) - 1
func (s *session) allocFileNum() uint64 {
return atomic.AddUint64(&s.stNextFileNum, 1) - 1
}
// Reuse given file number.
func (s *session) reuseFileNum(num uint64) {
for {
old, x := s.stFileNum, num
old, x := s.stNextFileNum, num
if old != x+1 {
x = old
}
if atomic.CompareAndSwapUint64(&s.stFileNum, old, x) {
if atomic.CompareAndSwapUint64(&s.stNextFileNum, old, x) {
break
}
}
@ -126,20 +126,20 @@ func (s *session) reuseFileNum(num uint64) {
// Fill given session record obj with current states; need external
// synchronization.
func (s *session) fillRecord(r *sessionRecord, snapshot bool) {
r.setNextNum(s.fileNum())
r.setNextFileNum(s.nextFileNum())
if snapshot {
if !r.has(recJournalNum) {
r.setJournalNum(s.stJournalNum)
}
if !r.has(recSeq) {
r.setSeq(s.stSeq)
if !r.has(recSeqNum) {
r.setSeqNum(s.stSeqNum)
}
for level, ik := range s.stCptrs {
for level, ik := range s.stCompPtrs {
if ik != nil {
r.addCompactionPointer(level, ik)
r.addCompPtr(level, ik)
}
}
@ -158,12 +158,12 @@ func (s *session) recordCommited(r *sessionRecord) {
s.stPrevJournalNum = r.prevJournalNum
}
if r.has(recSeq) {
s.stSeq = r.seq
if r.has(recSeqNum) {
s.stSeqNum = r.seqNum
}
for _, p := range r.compactionPointers {
s.stCptrs[p.level] = iKey(p.ikey)
for _, p := range r.compPtrs {
s.stCompPtrs[p.level] = iKey(p.ikey)
}
}
@ -178,10 +178,11 @@ func (s *session) newManifest(rec *sessionRecord, v *version) (err error) {
jw := journal.NewWriter(writer)
if v == nil {
v = s.version_NB()
v = s.version()
defer v.release()
}
if rec == nil {
rec = new(sessionRecord)
rec = &sessionRecord{numLevel: s.o.GetNumLevel()}
}
s.fillRecord(rec, true)
v.fillRecord(rec)

View File

@ -125,3 +125,33 @@ type Storage interface {
// Other methods should not be called after the storage has been closed.
Close() error
}
// FileInfo wraps basic file info.
type FileInfo struct {
Type FileType
Num uint64
}
func (fi FileInfo) String() string {
switch fi.Type {
case TypeManifest:
return fmt.Sprintf("MANIFEST-%06d", fi.Num)
case TypeJournal:
return fmt.Sprintf("%06d.log", fi.Num)
case TypeTable:
return fmt.Sprintf("%06d.ldb", fi.Num)
case TypeTemp:
return fmt.Sprintf("%06d.tmp", fi.Num)
default:
return fmt.Sprintf("%#x-%d", fi.Type, fi.Num)
}
}
// NewFileInfo creates new FileInfo from the given File. It will returns nil
// if File is nil.
func NewFileInfo(f File) *FileInfo {
if f == nil {
return nil
}
return &FileInfo{f.Type(), f.Num()}
}

View File

@ -11,6 +11,7 @@ import (
"fmt"
"io"
"io/ioutil"
"math/rand"
"os"
"path/filepath"
"sync"
@ -36,6 +37,19 @@ var (
tsNum = 0
)
type tsOp uint
const (
tsOpOpen tsOp = iota
tsOpCreate
tsOpRead
tsOpReadAt
tsOpWrite
tsOpSync
tsOpNum
)
type tsLock struct {
ts *testStorage
r util.Releaser
@ -54,6 +68,9 @@ type tsReader struct {
func (tr tsReader) Read(b []byte) (n int, err error) {
ts := tr.tf.ts
ts.countRead(tr.tf.Type())
if tr.tf.shouldErrLocked(tsOpRead) {
return 0, errors.New("leveldb.testStorage: emulated read error")
}
n, err = tr.Reader.Read(b)
if err != nil && err != io.EOF {
ts.t.Errorf("E: read error, num=%d type=%v n=%d: %v", tr.tf.Num(), tr.tf.Type(), n, err)
@ -64,6 +81,9 @@ func (tr tsReader) Read(b []byte) (n int, err error) {
func (tr tsReader) ReadAt(b []byte, off int64) (n int, err error) {
ts := tr.tf.ts
ts.countRead(tr.tf.Type())
if tr.tf.shouldErrLocked(tsOpReadAt) {
return 0, errors.New("leveldb.testStorage: emulated readAt error")
}
n, err = tr.Reader.ReadAt(b, off)
if err != nil && err != io.EOF {
ts.t.Errorf("E: readAt error, num=%d type=%v off=%d n=%d: %v", tr.tf.Num(), tr.tf.Type(), off, n, err)
@ -83,15 +103,12 @@ type tsWriter struct {
}
func (tw tsWriter) Write(b []byte) (n int, err error) {
ts := tw.tf.ts
ts.mu.Lock()
defer ts.mu.Unlock()
if ts.emuWriteErr&tw.tf.Type() != 0 {
if tw.tf.shouldErrLocked(tsOpWrite) {
return 0, errors.New("leveldb.testStorage: emulated write error")
}
n, err = tw.Writer.Write(b)
if err != nil {
ts.t.Errorf("E: write error, num=%d type=%v n=%d: %v", tw.tf.Num(), tw.tf.Type(), n, err)
tw.tf.ts.t.Errorf("E: write error, num=%d type=%v n=%d: %v", tw.tf.Num(), tw.tf.Type(), n, err)
}
return
}
@ -99,23 +116,23 @@ func (tw tsWriter) Write(b []byte) (n int, err error) {
func (tw tsWriter) Sync() (err error) {
ts := tw.tf.ts
ts.mu.Lock()
defer ts.mu.Unlock()
for ts.emuDelaySync&tw.tf.Type() != 0 {
ts.cond.Wait()
}
if ts.emuSyncErr&tw.tf.Type() != 0 {
ts.mu.Unlock()
if tw.tf.shouldErrLocked(tsOpSync) {
return errors.New("leveldb.testStorage: emulated sync error")
}
err = tw.Writer.Sync()
if err != nil {
ts.t.Errorf("E: sync error, num=%d type=%v: %v", tw.tf.Num(), tw.tf.Type(), err)
tw.tf.ts.t.Errorf("E: sync error, num=%d type=%v: %v", tw.tf.Num(), tw.tf.Type(), err)
}
return
}
func (tw tsWriter) Close() (err error) {
err = tw.Writer.Close()
tw.tf.close("reader", err)
tw.tf.close("writer", err)
return
}
@ -128,6 +145,16 @@ func (tf tsFile) x() uint64 {
return tf.Num()<<typeShift | uint64(tf.Type())
}
func (tf tsFile) shouldErr(op tsOp) bool {
return tf.ts.shouldErr(tf, op)
}
func (tf tsFile) shouldErrLocked(op tsOp) bool {
tf.ts.mu.Lock()
defer tf.ts.mu.Unlock()
return tf.shouldErr(op)
}
func (tf tsFile) checkOpen(m string) error {
ts := tf.ts
if writer, ok := ts.opens[tf.x()]; ok {
@ -164,7 +191,7 @@ func (tf tsFile) Open() (r storage.Reader, err error) {
if err != nil {
return
}
if ts.emuOpenErr&tf.Type() != 0 {
if tf.shouldErr(tsOpOpen) {
err = errors.New("leveldb.testStorage: emulated open error")
return
}
@ -191,7 +218,7 @@ func (tf tsFile) Create() (w storage.Writer, err error) {
if err != nil {
return
}
if ts.emuCreateErr&tf.Type() != 0 {
if tf.shouldErr(tsOpCreate) {
err = errors.New("leveldb.testStorage: emulated create error")
return
}
@ -232,25 +259,61 @@ type testStorage struct {
cond sync.Cond
// Open files, true=writer, false=reader
opens map[uint64]bool
emuOpenErr storage.FileType
emuCreateErr storage.FileType
emuDelaySync storage.FileType
emuWriteErr storage.FileType
emuSyncErr storage.FileType
ignoreOpenErr storage.FileType
readCnt uint64
readCntEn storage.FileType
emuErr [tsOpNum]storage.FileType
emuErrOnce [tsOpNum]storage.FileType
emuRandErr [tsOpNum]storage.FileType
emuRandErrProb int
emuErrOnceMap map[uint64]uint
emuRandRand *rand.Rand
}
func (ts *testStorage) SetOpenErr(t storage.FileType) {
func (ts *testStorage) shouldErr(tf tsFile, op tsOp) bool {
if ts.emuErr[op]&tf.Type() != 0 {
return true
} else if ts.emuRandErr[op]&tf.Type() != 0 || ts.emuErrOnce[op]&tf.Type() != 0 {
sop := uint(1) << op
eop := ts.emuErrOnceMap[tf.x()]
if eop&sop == 0 && (ts.emuRandRand.Int()%ts.emuRandErrProb == 0 || ts.emuErrOnce[op]&tf.Type() != 0) {
ts.emuErrOnceMap[tf.x()] = eop | sop
ts.t.Logf("I: emulated error: file=%d type=%v op=%v", tf.Num(), tf.Type(), op)
return true
}
}
return false
}
func (ts *testStorage) SetEmuErr(t storage.FileType, ops ...tsOp) {
ts.mu.Lock()
ts.emuOpenErr = t
for _, op := range ops {
ts.emuErr[op] = t
}
ts.mu.Unlock()
}
func (ts *testStorage) SetCreateErr(t storage.FileType) {
func (ts *testStorage) SetEmuErrOnce(t storage.FileType, ops ...tsOp) {
ts.mu.Lock()
ts.emuCreateErr = t
for _, op := range ops {
ts.emuErrOnce[op] = t
}
ts.mu.Unlock()
}
func (ts *testStorage) SetEmuRandErr(t storage.FileType, ops ...tsOp) {
ts.mu.Lock()
for _, op := range ops {
ts.emuRandErr[op] = t
}
ts.mu.Unlock()
}
func (ts *testStorage) SetEmuRandErrProb(prob int) {
ts.mu.Lock()
ts.emuRandErrProb = prob
ts.mu.Unlock()
}
@ -268,18 +331,6 @@ func (ts *testStorage) ReleaseSync(t storage.FileType) {
ts.mu.Unlock()
}
func (ts *testStorage) SetWriteErr(t storage.FileType) {
ts.mu.Lock()
ts.emuWriteErr = t
ts.mu.Unlock()
}
func (ts *testStorage) SetSyncErr(t storage.FileType) {
ts.mu.Lock()
ts.emuSyncErr = t
ts.mu.Unlock()
}
func (ts *testStorage) ReadCounter() uint64 {
ts.mu.Lock()
defer ts.mu.Unlock()
@ -454,10 +505,13 @@ func newTestStorage(t *testing.T) *testStorage {
stor = storage.NewMemStorage()
}
ts := &testStorage{
t: t,
Storage: stor,
closeFn: closeFn,
opens: make(map[uint64]bool),
t: t,
Storage: stor,
closeFn: closeFn,
opens: make(map[uint64]bool),
emuErrOnceMap: make(map[uint64]uint),
emuRandErrProb: 0x999,
emuRandRand: rand.New(rand.NewSource(0xfacedead)),
}
ts.cond.L = &ts.mu
return ts

View File

@ -7,6 +7,7 @@
package leveldb
import (
"fmt"
"sort"
"sync/atomic"
@ -82,6 +83,18 @@ type tFiles []*tFile
func (tf tFiles) Len() int { return len(tf) }
func (tf tFiles) Swap(i, j int) { tf[i], tf[j] = tf[j], tf[i] }
func (tf tFiles) nums() string {
x := "[ "
for i, f := range tf {
if i != 0 {
x += ", "
}
x += fmt.Sprint(f.file.Num())
}
x += " ]"
return x
}
// Returns true if i smallest key is less than j.
// This used for sort by key in ascending order.
func (tf tFiles) lessByKey(icmp *iComparer, i, j int) bool {
@ -149,7 +162,7 @@ func (tf tFiles) overlaps(icmp *iComparer, umin, umax []byte, unsorted bool) boo
i := 0
if len(umin) > 0 {
// Find the earliest possible internal key for min.
i = tf.searchMax(icmp, newIKey(umin, kMaxSeq, tSeek))
i = tf.searchMax(icmp, newIkey(umin, kMaxSeq, ktSeek))
}
if i >= len(tf) {
// Beginning of range is after all files, so no overlap.
@ -159,24 +172,25 @@ func (tf tFiles) overlaps(icmp *iComparer, umin, umax []byte, unsorted bool) boo
}
// Returns tables whose its key range overlaps with given key range.
// If overlapped is true then the search will be expanded to tables that
// overlaps with each other.
// Range will be expanded if ukey found hop across tables.
// If overlapped is true then the search will be restarted if umax
// expanded.
// The dst content will be overwritten.
func (tf tFiles) getOverlaps(dst tFiles, icmp *iComparer, umin, umax []byte, overlapped bool) tFiles {
x := len(dst)
dst = dst[:0]
for i := 0; i < len(tf); {
t := tf[i]
if t.overlaps(icmp, umin, umax) {
if overlapped {
// For overlapped files, check if the newly added file has
// expanded the range. If so, restart search.
if umin != nil && icmp.uCompare(t.imin.ukey(), umin) < 0 {
umin = t.imin.ukey()
dst = dst[:x]
i = 0
continue
} else if umax != nil && icmp.uCompare(t.imax.ukey(), umax) > 0 {
umax = t.imax.ukey()
dst = dst[:x]
if umin != nil && icmp.uCompare(t.imin.ukey(), umin) < 0 {
umin = t.imin.ukey()
dst = dst[:0]
i = 0
continue
} else if umax != nil && icmp.uCompare(t.imax.ukey(), umax) > 0 {
umax = t.imax.ukey()
// Restart search if it is overlapped.
if overlapped {
dst = dst[:0]
i = 0
continue
}
@ -289,7 +303,7 @@ func (t *tOps) create() (*tWriter, error) {
t: t,
file: file,
w: fw,
tw: table.NewWriter(fw, t.s.o),
tw: table.NewWriter(fw, t.s.o.Options),
}, nil
}
@ -337,7 +351,13 @@ func (t *tOps) open(f *tFile) (ch cache.Handle, err error) {
if bc := t.s.o.GetBlockCache(); bc != nil {
bcacheNS = bc.GetNamespace(num)
}
return 1, table.NewReader(r, int64(f.size), bcacheNS, t.bpool, t.s.o)
var tr *table.Reader
tr, err = table.NewReader(r, int64(f.size), storage.NewFileInfo(f.file), bcacheNS, t.bpool, t.s.o.Options)
if err != nil {
r.Close()
return 0, nil
}
return 1, tr
})
if ch == nil && err == nil {
err = ErrClosed
@ -440,28 +460,34 @@ func (w *tWriter) empty() bool {
return w.first == nil
}
// Closes the storage.Writer.
func (w *tWriter) close() {
if w.w != nil {
w.w.Close()
w.w = nil
}
}
// Finalizes the table and returns table file.
func (w *tWriter) finish() (f *tFile, err error) {
defer w.close()
err = w.tw.Close()
if err != nil {
return
}
err = w.w.Sync()
if err != nil {
w.w.Close()
return
}
w.w.Close()
f = newTableFile(w.file, uint64(w.tw.BytesLen()), iKey(w.first), iKey(w.last))
return
}
// Drops the table.
func (w *tWriter) drop() {
w.w.Close()
w.close()
w.file.Remove()
w.t.s.reuseFileNum(w.file.Num())
w.w = nil
w.file = nil
w.tw = nil
w.first = nil

View File

@ -8,29 +8,41 @@ package table
import (
"encoding/binary"
"errors"
"fmt"
"io"
"sort"
"strings"
"sync"
"code.google.com/p/snappy-go/snappy"
"github.com/syndtr/gosnappy/snappy"
"github.com/syndtr/goleveldb/leveldb/cache"
"github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/filter"
"github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/opt"
"github.com/syndtr/goleveldb/leveldb/storage"
"github.com/syndtr/goleveldb/leveldb/util"
)
var (
ErrNotFound = util.ErrNotFound
ErrNotFound = errors.ErrNotFound
ErrReaderReleased = errors.New("leveldb/table: reader released")
ErrIterReleased = errors.New("leveldb/table: iterator released")
)
type ErrCorrupted struct {
Pos int64
Size int64
Kind string
Reason string
}
func (e *ErrCorrupted) Error() string {
return fmt.Sprintf("leveldb/table: corruption on %s (pos=%d): %s", e.Kind, e.Pos, e.Reason)
}
func max(x, y int) int {
if x > y {
return x
@ -38,13 +50,19 @@ func max(x, y int) int {
return y
}
func verifyBlockChecksum(data []byte) bool {
n := len(data) - 4
checksum0 := binary.LittleEndian.Uint32(data[n:])
checksum1 := util.NewCRC(data[:n]).Value()
return checksum0 == checksum1
}
type block struct {
bpool *util.BufferPool
bh blockHandle
data []byte
restartsLen int
restartsOffset int
// Whether checksum is verified and valid.
checksum bool
}
func (b *block) seek(cmp comparer.Comparer, rstart, rlimit int, key []byte) (index, offset int, err error) {
@ -77,7 +95,7 @@ func (b *block) restartOffset(index int) int {
func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error) {
if offset >= b.restartsOffset {
if offset != b.restartsOffset {
err = errors.New("leveldb/table: Reader: BlockEntry: invalid block (block entries offset not aligned)")
err = &ErrCorrupted{Reason: "entries offset not aligned"}
}
return
}
@ -87,7 +105,7 @@ func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error)
m := n0 + n1 + n2
n = m + int(v1) + int(v2)
if n0 <= 0 || n1 <= 0 || n2 <= 0 || offset+n > b.restartsOffset {
err = errors.New("leveldb/table: Reader: invalid block (block entries corrupted)")
err = &ErrCorrupted{Reason: "entries corrupted"}
return
}
key = b.data[offset+m : offset+m+int(v1)]
@ -251,7 +269,7 @@ func (i *blockIter) Next() bool {
for i.offset < i.offsetRealStart {
key, value, nShared, n, err := i.block.entry(i.offset)
if err != nil {
i.sErr(err)
i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err))
return false
}
if n == 0 {
@ -265,13 +283,13 @@ func (i *blockIter) Next() bool {
if i.offset >= i.offsetLimit {
i.dir = dirEOI
if i.offset != i.offsetLimit {
i.sErr(errors.New("leveldb/table: Reader: Next: invalid block (block entries offset not aligned)"))
i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned"))
}
return false
}
key, value, nShared, n, err := i.block.entry(i.offset)
if err != nil {
i.sErr(err)
i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err))
return false
}
if n == 0 {
@ -356,7 +374,7 @@ func (i *blockIter) Prev() bool {
for {
key, value, nShared, n, err := i.block.entry(offset)
if err != nil {
i.sErr(err)
i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err))
return false
}
if offset >= i.offsetRealStart {
@ -375,7 +393,7 @@ func (i *blockIter) Prev() bool {
// Stop if target offset reached.
if offset >= i.offset {
if offset != i.offset {
i.sErr(errors.New("leveldb/table: Reader: Prev: invalid block (block entries offset not aligned)"))
i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned"))
return false
}
@ -473,7 +491,6 @@ type indexIter struct {
tr *Reader
slice *util.Range
// Options
checksum bool
fillCache bool
}
@ -484,28 +501,29 @@ func (i *indexIter) Get() iterator.Iterator {
}
dataBH, n := decodeBlockHandle(value)
if n == 0 {
return iterator.NewEmptyIterator(errors.New("leveldb/table: Reader: invalid table (bad data block handle)"))
return iterator.NewEmptyIterator(i.tr.newErrCorruptedBH(i.tr.indexBH, "bad data block handle"))
}
var slice *util.Range
if i.slice != nil && (i.blockIter.isFirst() || i.blockIter.isLast()) {
slice = i.slice
}
return i.tr.getDataIterErr(dataBH, slice, i.checksum, i.fillCache)
return i.tr.getDataIterErr(dataBH, slice, i.tr.verifyChecksum, i.fillCache)
}
// Reader is a table reader.
type Reader struct {
mu sync.RWMutex
fi *storage.FileInfo
reader io.ReaderAt
cache cache.Namespace
err error
bpool *util.BufferPool
// Options
cmp comparer.Comparer
filter filter.Filter
checksum bool
strictIter bool
o *opt.Options
cmp comparer.Comparer
filter filter.Filter
verifyChecksum bool
dataEnd int64
indexBH, filterBH blockHandle
@ -513,23 +531,43 @@ type Reader struct {
filterBlock *filterBlock
}
func verifyChecksum(data []byte) bool {
n := len(data) - 4
checksum0 := binary.LittleEndian.Uint32(data[n:])
checksum1 := util.NewCRC(data[:n]).Value()
return checksum0 == checksum1
func (r *Reader) blockKind(bh blockHandle) string {
switch bh.offset {
case r.indexBH.offset:
return "index-block"
case r.filterBH.offset:
return "filter-block"
default:
return "data-block"
}
}
func (r *Reader) readRawBlock(bh blockHandle, checksum bool) ([]byte, error) {
func (r *Reader) newErrCorrupted(pos, size int64, kind, reason string) error {
return &errors.ErrCorrupted{File: r.fi, Err: &ErrCorrupted{Pos: pos, Size: size, Kind: kind, Reason: reason}}
}
func (r *Reader) newErrCorruptedBH(bh blockHandle, reason string) error {
return r.newErrCorrupted(int64(bh.offset), int64(bh.length), r.blockKind(bh), reason)
}
func (r *Reader) fixErrCorruptedBH(bh blockHandle, err error) error {
if cerr, ok := err.(*ErrCorrupted); ok {
cerr.Pos = int64(bh.offset)
cerr.Size = int64(bh.length)
cerr.Kind = r.blockKind(bh)
return &errors.ErrCorrupted{File: r.fi, Err: cerr}
}
return err
}
func (r *Reader) readRawBlock(bh blockHandle, verifyChecksum bool) ([]byte, error) {
data := r.bpool.Get(int(bh.length + blockTrailerLen))
if _, err := r.reader.ReadAt(data, int64(bh.offset)); err != nil && err != io.EOF {
return nil, err
}
if checksum || r.checksum {
if !verifyChecksum(data) {
r.bpool.Put(data)
return nil, errors.New("leveldb/table: Reader: invalid block (checksum mismatch)")
}
if verifyChecksum && !verifyBlockChecksum(data) {
r.bpool.Put(data)
return nil, r.newErrCorruptedBH(bh, "checksum mismatch")
}
switch data[bh.length] {
case blockTypeNoCompression:
@ -537,38 +575,40 @@ func (r *Reader) readRawBlock(bh blockHandle, checksum bool) ([]byte, error) {
case blockTypeSnappyCompression:
decLen, err := snappy.DecodedLen(data[:bh.length])
if err != nil {
return nil, err
return nil, r.newErrCorruptedBH(bh, err.Error())
}
tmp := data
data, err = snappy.Decode(r.bpool.Get(decLen), tmp[:bh.length])
r.bpool.Put(tmp)
decData := r.bpool.Get(decLen)
decData, err = snappy.Decode(decData, data[:bh.length])
r.bpool.Put(data)
if err != nil {
return nil, err
r.bpool.Put(decData)
return nil, r.newErrCorruptedBH(bh, err.Error())
}
data = decData
default:
r.bpool.Put(data)
return nil, fmt.Errorf("leveldb/table: Reader: unknown block compression type: %d", data[bh.length])
return nil, r.newErrCorruptedBH(bh, fmt.Sprintf("unknown compression type %#x", data[bh.length]))
}
return data, nil
}
func (r *Reader) readBlock(bh blockHandle, checksum bool) (*block, error) {
data, err := r.readRawBlock(bh, checksum)
func (r *Reader) readBlock(bh blockHandle, verifyChecksum bool) (*block, error) {
data, err := r.readRawBlock(bh, verifyChecksum)
if err != nil {
return nil, err
}
restartsLen := int(binary.LittleEndian.Uint32(data[len(data)-4:]))
b := &block{
bpool: r.bpool,
bh: bh,
data: data,
restartsLen: restartsLen,
restartsOffset: len(data) - (restartsLen+1)*4,
checksum: checksum || r.checksum,
}
return b, nil
}
func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*block, util.Releaser, error) {
func (r *Reader) readBlockCached(bh blockHandle, verifyChecksum, fillCache bool) (*block, util.Releaser, error) {
if r.cache != nil {
var err error
ch := r.cache.Get(bh.offset, func() (charge int, value interface{}) {
@ -576,7 +616,7 @@ func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*blo
return 0, nil
}
var b *block
b, err = r.readBlock(bh, checksum)
b, err = r.readBlock(bh, verifyChecksum)
if err != nil {
return 0, nil
}
@ -586,14 +626,7 @@ func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*blo
b, ok := ch.Value().(*block)
if !ok {
ch.Release()
return nil, nil, errors.New("leveldb/table: Reader: inconsistent block type")
}
if !b.checksum && (r.checksum || checksum) {
if !verifyChecksum(b.data) {
ch.Release()
return nil, nil, errors.New("leveldb/table: Reader: invalid block (checksum mismatch)")
}
b.checksum = true
return nil, nil, errors.New("leveldb/table: inconsistent block type")
}
return b, ch, err
} else if err != nil {
@ -601,7 +634,7 @@ func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*blo
}
}
b, err := r.readBlock(bh, checksum)
b, err := r.readBlock(bh, verifyChecksum)
return b, b, err
}
@ -612,12 +645,12 @@ func (r *Reader) readFilterBlock(bh blockHandle) (*filterBlock, error) {
}
n := len(data)
if n < 5 {
return nil, errors.New("leveldb/table: Reader: invalid filter block (too short)")
return nil, r.newErrCorruptedBH(bh, "too short")
}
m := n - 5
oOffset := int(binary.LittleEndian.Uint32(data[m:]))
if oOffset > m {
return nil, errors.New("leveldb/table: Reader: invalid filter block (invalid offset)")
return nil, r.newErrCorruptedBH(bh, "invalid data-offsets offset")
}
b := &filterBlock{
bpool: r.bpool,
@ -647,7 +680,7 @@ func (r *Reader) readFilterBlockCached(bh blockHandle, fillCache bool) (*filterB
b, ok := ch.Value().(*filterBlock)
if !ok {
ch.Release()
return nil, nil, errors.New("leveldb/table: Reader: inconsistent block type")
return nil, nil, errors.New("leveldb/table: inconsistent block type")
}
return b, ch, err
} else if err != nil {
@ -673,25 +706,6 @@ func (r *Reader) getFilterBlock(fillCache bool) (*filterBlock, util.Releaser, er
return r.filterBlock, util.NoopReleaser{}, nil
}
func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, checksum, fillCache bool) iterator.Iterator {
b, rel, err := r.readBlockCached(dataBH, checksum, fillCache)
if err != nil {
return iterator.NewEmptyIterator(err)
}
return r.newBlockIter(b, rel, slice, false)
}
func (r *Reader) getDataIterErr(dataBH blockHandle, slice *util.Range, checksum, fillCache bool) iterator.Iterator {
r.mu.RLock()
defer r.mu.RUnlock()
if r.err != nil {
return iterator.NewEmptyIterator(r.err)
}
return r.getDataIter(dataBH, slice, checksum, fillCache)
}
func (r *Reader) newBlockIter(b *block, bReleaser util.Releaser, slice *util.Range, inclLimit bool) *blockIter {
bi := &blockIter{
tr: r,
@ -726,12 +740,31 @@ func (r *Reader) newBlockIter(b *block, bReleaser util.Releaser, slice *util.Ran
}
bi.reset()
if bi.offsetStart > bi.offsetLimit {
bi.sErr(errors.New("leveldb/table: Reader: invalid slice range"))
bi.sErr(errors.New("leveldb/table: invalid slice range"))
}
}
return bi
}
func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator {
b, rel, err := r.readBlockCached(dataBH, verifyChecksum, fillCache)
if err != nil {
return iterator.NewEmptyIterator(err)
}
return r.newBlockIter(b, rel, slice, false)
}
func (r *Reader) getDataIterErr(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator {
r.mu.RLock()
defer r.mu.RUnlock()
if r.err != nil {
return iterator.NewEmptyIterator(r.err)
}
return r.getDataIter(dataBH, slice, verifyChecksum, fillCache)
}
// NewIterator creates an iterator from the table.
//
// Slice allows slicing the iterator to only contains keys in the given
@ -760,10 +793,9 @@ func (r *Reader) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.It
blockIter: r.newBlockIter(indexBlock, rel, slice, true),
tr: r,
slice: slice,
checksum: ro.GetStrict(opt.StrictBlockChecksum),
fillCache: !ro.GetDontFillCache(),
}
return iterator.NewIndexedIterator(index, r.strictIter || ro.GetStrict(opt.StrictIterator), true)
return iterator.NewIndexedIterator(index, opt.GetStrict(r.o, ro, opt.StrictReader))
}
// Find finds key/value pair whose key is greater than or equal to the
@ -798,7 +830,7 @@ func (r *Reader) Find(key []byte, ro *opt.ReadOptions) (rkey, value []byte, err
}
dataBH, n := decodeBlockHandle(index.Value())
if n == 0 {
err = errors.New("leveldb/table: Reader: invalid table (bad data block handle)")
r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle")
return
}
if r.filter != nil {
@ -811,7 +843,7 @@ func (r *Reader) Find(key []byte, ro *opt.ReadOptions) (rkey, value []byte, err
rel.Release()
}
}
data := r.getDataIter(dataBH, nil, ro.GetStrict(opt.StrictBlockChecksum), !ro.GetDontFillCache())
data := r.getDataIter(dataBH, nil, r.verifyChecksum, !ro.GetDontFillCache())
defer data.Release()
if !data.Seek(key) {
err = data.Error()
@ -877,7 +909,7 @@ func (r *Reader) OffsetOf(key []byte) (offset int64, err error) {
if index.Seek(key) {
dataBH, n := decodeBlockHandle(index.Value())
if n == 0 {
err = errors.New("leveldb/table: Reader: invalid table (bad data block handle)")
r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle")
return
}
offset = int64(dataBH.offset)
@ -914,51 +946,56 @@ func (r *Reader) Release() {
}
// NewReader creates a new initialized table reader for the file.
// The cache and bpool is optional and can be nil.
// The fi, cache and bpool is optional and can be nil.
//
// The returned table reader instance is goroutine-safe.
func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, bpool *util.BufferPool, o *opt.Options) *Reader {
func NewReader(f io.ReaderAt, size int64, fi *storage.FileInfo, cache cache.Namespace, bpool *util.BufferPool, o *opt.Options) (*Reader, error) {
r := &Reader{
reader: f,
cache: cache,
bpool: bpool,
cmp: o.GetComparer(),
checksum: o.GetStrict(opt.StrictBlockChecksum),
strictIter: o.GetStrict(opt.StrictIterator),
fi: fi,
reader: f,
cache: cache,
bpool: bpool,
o: o,
cmp: o.GetComparer(),
verifyChecksum: o.GetStrict(opt.StrictBlockChecksum),
}
if f == nil {
r.err = errors.New("leveldb/table: Reader: nil file")
return r
return nil, errors.New("leveldb/table: nil file")
}
if size < footerLen {
r.err = errors.New("leveldb/table: Reader: invalid table (file size is too small)")
return r
r.err = r.newErrCorrupted(0, size, "table", "too small")
return r, nil
}
footerPos := size - footerLen
var footer [footerLen]byte
if _, err := r.reader.ReadAt(footer[:], size-footerLen); err != nil && err != io.EOF {
r.err = fmt.Errorf("leveldb/table: Reader: invalid table (could not read footer): %v", err)
if _, err := r.reader.ReadAt(footer[:], footerPos); err != nil && err != io.EOF {
return nil, err
}
if string(footer[footerLen-len(magic):footerLen]) != magic {
r.err = errors.New("leveldb/table: Reader: invalid table (bad magic number)")
return r
r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad magic number")
return r, nil
}
// Decode the metaindex block handle.
metaBH, n := decodeBlockHandle(footer[:])
if n == 0 {
r.err = errors.New("leveldb/table: Reader: invalid table (bad metaindex block handle)")
return r
r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad metaindex block handle")
return r, nil
}
// Decode the index block handle.
r.indexBH, n = decodeBlockHandle(footer[n:])
if n == 0 {
r.err = errors.New("leveldb/table: Reader: invalid table (bad index block handle)")
return r
r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad index block handle")
return r, nil
}
// Read metaindex block.
metaBlock, err := r.readBlock(metaBH, true)
if err != nil {
r.err = err
return r
if errors.IsCorrupted(err) {
r.err = err
return r, nil
} else {
return nil, err
}
}
// Set data end.
r.dataEnd = int64(metaBH.offset)
@ -995,13 +1032,22 @@ func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, bpool *util.Buf
// Cache index and filter block locally, since we don't have global cache.
if cache == nil {
r.indexBlock, r.err = r.readBlock(r.indexBH, true)
if r.err != nil {
return r
r.indexBlock, err = r.readBlock(r.indexBH, true)
if err != nil {
if errors.IsCorrupted(err) {
r.err = err
return r, nil
} else {
return nil, err
}
}
if r.filter != nil {
r.filterBlock, err = r.readFilterBlock(r.filterBH)
if err != nil {
if !errors.IsCorrupted(r.err) {
return nil, err
}
// Don't use filter then.
r.filter = nil
r.filterBH = blockHandle{}
@ -1009,5 +1055,5 @@ func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, bpool *util.Buf
}
}
return r
return r, nil
}

View File

@ -133,9 +133,9 @@ Filter block trailer:
+- 4-bytes -+
/ \
+---------------+---------------+---------------+-------------------------+------------------+
| offset 1 | .... | offset n | filter offset (4-bytes) | base Lg (1-byte) |
+-------------- +---------------+---------------+-------------------------+------------------+
+---------------+---------------+---------------+-------------------------------+------------------+
| data 1 offset | .... | data n offset | data-offsets offset (4-bytes) | base Lg (1-byte) |
+-------------- +---------------+---------------+-------------------------------+------------------+
NOTE: All fixed-length integer are little-endian.

View File

@ -59,7 +59,8 @@ var _ = testutil.Defer(func() {
It("Should be able to approximate offset of a key correctly", func() {
Expect(err).ShouldNot(HaveOccurred())
tr := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, o)
tr, err := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, nil, o)
Expect(err).ShouldNot(HaveOccurred())
CheckOffset := func(key string, expect, threshold int) {
offset, err := tr.OffsetOf([]byte(key))
Expect(err).ShouldNot(HaveOccurred())
@ -95,7 +96,7 @@ var _ = testutil.Defer(func() {
tw.Close()
// Opening the table.
tr := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, o)
tr, _ := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, nil, o)
return tableWrapper{tr}
}
Test := func(kv *testutil.KeyValue, body func(r *Reader)) func() {

View File

@ -12,7 +12,7 @@ import (
"fmt"
"io"
"code.google.com/p/snappy-go/snappy"
"github.com/syndtr/gosnappy/snappy"
"github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/filter"

View File

@ -12,6 +12,7 @@ import (
. "github.com/onsi/gomega"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/util"
)
@ -110,7 +111,7 @@ func (t *DBTesting) TestAllPresent() {
func (t *DBTesting) TestDeletedKey(key []byte) {
_, err := t.DB.TestGet(key)
Expect(err).Should(Equal(util.ErrNotFound), "Get on deleted key %q, %s", key, t.text())
Expect(err).Should(Equal(errors.ErrNotFound), "Get on deleted key %q, %s", key, t.text())
}
func (t *DBTesting) TestAllDeleted() {

View File

@ -13,6 +13,7 @@ import (
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/util"
)
@ -59,7 +60,7 @@ func KeyValueTesting(rnd *rand.Rand, kv KeyValue, p DB, setup func(KeyValue) DB,
}
rkey, _, err := db.TestFind(key)
Expect(err).Should(HaveOccurred(), "Find for key %q yield key %q", key, rkey)
Expect(err).Should(Equal(util.ErrNotFound))
Expect(err).Should(Equal(errors.ErrNotFound))
}
})
@ -77,7 +78,7 @@ func KeyValueTesting(rnd *rand.Rand, kv KeyValue, p DB, setup func(KeyValue) DB,
if len(key_) > 0 {
_, err = db.TestGet(key_)
Expect(err).Should(HaveOccurred(), "Error for key %q", key_)
Expect(err).Should(Equal(util.ErrNotFound))
Expect(err).Should(Equal(errors.ErrNotFound))
}
})
}

View File

@ -14,10 +14,10 @@ import (
)
func shorten(str string) string {
if len(str) <= 4 {
if len(str) <= 8 {
return str
}
return str[:1] + ".." + str[len(str)-1:]
return str[:3] + ".." + str[len(str)-3:]
}
var bunits = [...]string{"", "Ki", "Mi", "Gi"}

View File

@ -8,6 +8,7 @@ package util
import (
"fmt"
"sync"
"sync/atomic"
"time"
)
@ -19,17 +20,16 @@ type buffer struct {
// BufferPool is a 'buffer pool'.
type BufferPool struct {
pool [6]chan []byte
size [5]uint32
sizeMiss [5]uint32
sizeHalf [5]uint32
baseline [4]int
baselinex0 int
baselinex1 int
baseline0 int
baseline1 int
baseline2 int
close chan struct{}
pool [6]chan []byte
size [5]uint32
sizeMiss [5]uint32
sizeHalf [5]uint32
baseline [4]int
baseline0 int
mu sync.RWMutex
closed bool
closeC chan struct{}
get uint32
put uint32
@ -58,6 +58,13 @@ func (p *BufferPool) Get(n int) []byte {
return make([]byte, n)
}
p.mu.RLock()
defer p.mu.RUnlock()
if p.closed {
return make([]byte, n)
}
atomic.AddUint32(&p.get, 1)
poolNum := p.poolNum(n)
@ -153,12 +160,16 @@ func (p *BufferPool) Put(b []byte) {
return
}
p.mu.RLock()
defer p.mu.RUnlock()
if p.closed {
return
}
atomic.AddUint32(&p.put, 1)
pool := p.pool[p.poolNum(cap(b))]
defer func() {
recover()
}()
select {
case pool <- b:
default:
@ -171,10 +182,12 @@ func (p *BufferPool) Close() {
return
}
select {
case p.close <- struct{}{}:
default:
p.mu.Lock()
if !p.closed {
p.closed = true
p.closeC <- struct{}{}
}
p.mu.Unlock()
}
func (p *BufferPool) String() string {
@ -197,7 +210,8 @@ func (p *BufferPool) drain() {
default:
}
}
case <-p.close:
case <-p.closeC:
close(p.closeC)
for _, ch := range p.pool {
close(ch)
}
@ -214,7 +228,7 @@ func NewBufferPool(baseline int) *BufferPool {
p := &BufferPool{
baseline0: baseline,
baseline: [...]int{baseline / 4, baseline / 2, baseline * 2, baseline * 4},
close: make(chan struct{}, 1),
closeC: make(chan struct{}, 1),
}
for i, cap := range []int{2, 2, 4, 4, 2, 1} {
p.pool[i] = make(chan []byte, cap)

View File

@ -12,7 +12,6 @@ import (
)
var (
ErrNotFound = errors.New("leveldb: not found")
ErrReleased = errors.New("leveldb: resource already relesed")
ErrHasReleaser = errors.New("leveldb: releaser already defined")
)

View File

@ -7,7 +7,6 @@
package leveldb
import (
"errors"
"sync/atomic"
"unsafe"
@ -16,19 +15,6 @@ import (
"github.com/syndtr/goleveldb/leveldb/util"
)
var levelMaxSize [kNumLevels]float64
func init() {
// Precompute max size of each level
for level := range levelMaxSize {
res := float64(10 * 1048576)
for n := level; n > 1; n-- {
res *= 10
}
levelMaxSize[level] = res
}
}
type tSet struct {
level int
table *tFile
@ -37,7 +23,7 @@ type tSet struct {
type version struct {
s *session
tables [kNumLevels]tFiles
tables []tFiles
// Level that should be compacted next and its compaction score.
// Score < 1 means compaction is not strictly needed. These fields
@ -47,11 +33,16 @@ type version struct {
cSeek unsafe.Pointer
ref int
ref int
// Succeeding version.
next *version
}
func (v *version) release_NB() {
func newVersion(s *session) *version {
return &version{s: s, tables: make([]tFiles, s.o.GetNumLevel())}
}
func (v *version) releaseNB() {
v.ref--
if v.ref > 0 {
return
@ -77,13 +68,13 @@ func (v *version) release_NB() {
}
}
v.next.release_NB()
v.next.releaseNB()
v.next = nil
}
func (v *version) release() {
v.s.vmu.Lock()
v.release_NB()
v.releaseNB()
v.s.vmu.Unlock()
}
@ -130,10 +121,11 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions) (value []byte, tcomp bool,
tset *tSet
tseek bool
l0found bool
l0seq uint64
l0vt vType
l0val []byte
// Level-0.
zfound bool
zseq uint64
zkt kType
zval []byte
)
err = ErrNotFound
@ -150,55 +142,52 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions) (value []byte, tcomp bool,
}
}
ikey__, val_, err_ := v.s.tops.find(t, ikey, ro)
switch err_ {
fikey, fval, ferr := v.s.tops.find(t, ikey, ro)
switch ferr {
case nil:
case ErrNotFound:
return true
default:
err = err_
err = ferr
return false
}
ikey_ := iKey(ikey__)
if seq, vt, ok := ikey_.parseNum(); ok {
if v.s.icmp.uCompare(ukey, ikey_.ukey()) != 0 {
return true
}
if level == 0 {
if seq >= l0seq {
l0found = true
l0seq = seq
l0vt = vt
l0val = val_
if fukey, fseq, fkt, fkerr := parseIkey(fikey); fkerr == nil {
if v.s.icmp.uCompare(ukey, fukey) == 0 {
if level == 0 {
if fseq >= zseq {
zfound = true
zseq = fseq
zkt = fkt
zval = fval
}
} else {
switch fkt {
case ktVal:
value = fval
err = nil
case ktDel:
default:
panic("leveldb: invalid iKey type")
}
return false
}
} else {
switch vt {
case tVal:
value = val_
err = nil
case tDel:
default:
panic("leveldb: invalid internal key type")
}
return false
}
} else {
err = errors.New("leveldb: internal key corrupted")
err = fkerr
return false
}
return true
}, func(level int) bool {
if l0found {
switch l0vt {
case tVal:
value = l0val
if zfound {
switch zkt {
case ktVal:
value = zval
err = nil
case tDel:
case ktDel:
default:
panic("leveldb: invalid internal key type")
panic("leveldb: invalid iKey type")
}
return false
}
@ -216,13 +205,13 @@ func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []it
its = append(its, it)
}
strict := v.s.o.GetStrict(opt.StrictIterator) || ro.GetStrict(opt.StrictIterator)
strict := opt.GetStrict(v.s.o.Options, ro, opt.StrictReader)
for _, tables := range v.tables[1:] {
if len(tables) == 0 {
continue
}
it := iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict, true)
it := iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict)
its = append(its, it)
}
@ -230,7 +219,7 @@ func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []it
}
func (v *version) newStaging() *versionStaging {
return &versionStaging{base: v}
return &versionStaging{base: v, tables: make([]tablesScratch, v.s.o.GetNumLevel())}
}
// Spawn a new version based on this version.
@ -285,12 +274,13 @@ func (v *version) offsetOf(ikey iKey) (n uint64, err error) {
func (v *version) pickLevel(umin, umax []byte) (level int) {
if !v.tables[0].overlaps(v.s.icmp, umin, umax, true) {
var overlaps tFiles
for ; level < kMaxMemCompactLevel; level++ {
maxLevel := v.s.o.GetMaxMemCompationLevel()
for ; level < maxLevel; level++ {
if v.tables[level+1].overlaps(v.s.icmp, umin, umax, false) {
break
}
overlaps = v.tables[level+2].getOverlaps(overlaps, v.s.icmp, umin, umax, false)
if overlaps.size() > kMaxGrandParentOverlapBytes {
if overlaps.size() > uint64(v.s.o.GetCompactionGPOverlaps(level)) {
break
}
}
@ -318,9 +308,9 @@ func (v *version) computeCompaction() {
// file size is small (perhaps because of a small write-buffer
// setting, or very high compression ratios, or lots of
// overwrites/deletions).
score = float64(len(tables)) / kL0_CompactionTrigger
score = float64(len(tables)) / float64(v.s.o.GetCompactionL0Trigger())
} else {
score = float64(tables.size()) / levelMaxSize[level]
score = float64(tables.size()) / float64(v.s.o.GetCompactionTotalSize(level))
}
if score > bestScore {
@ -337,12 +327,14 @@ func (v *version) needCompaction() bool {
return v.cScore >= 1 || atomic.LoadPointer(&v.cSeek) != nil
}
type tablesScratch struct {
added map[uint64]atRecord
deleted map[uint64]struct{}
}
type versionStaging struct {
base *version
tables [kNumLevels]struct {
added map[uint64]ntRecord
deleted map[uint64]struct{}
}
tables []tablesScratch
}
func (p *versionStaging) commit(r *sessionRecord) {
@ -367,7 +359,7 @@ func (p *versionStaging) commit(r *sessionRecord) {
tm := &(p.tables[r.level])
if tm.added == nil {
tm.added = make(map[uint64]ntRecord)
tm.added = make(map[uint64]atRecord)
}
tm.added[r.num] = r
@ -379,7 +371,7 @@ func (p *versionStaging) commit(r *sessionRecord) {
func (p *versionStaging) finish() *version {
// Build new version.
nv := &version{s: p.base.s}
nv := newVersion(p.base.s)
for level, tm := range p.tables {
btables := p.base.tables[level]
@ -402,7 +394,7 @@ func (p *versionStaging) finish() *version {
// New tables.
for _, r := range tm.added {
nt = append(nt, r.makeFile(p.base.s))
nt = append(nt, p.base.s.tableFileFromRecord(r))
}
// Sort tables.
@ -429,7 +421,7 @@ func (vr *versionReleaser) Release() {
v := vr.v
v.s.vmu.Lock()
if !vr.once {
v.release_NB()
v.releaseNB()
vr.once = true
}
v.s.vmu.Unlock()