|
5 | 5 | package block
|
6 | 6 |
|
7 | 7 | import (
|
| 8 | + "context" |
8 | 9 | "encoding/binary"
|
| 10 | + "path/filepath" |
| 11 | + "runtime" |
9 | 12 | "time"
|
10 | 13 |
|
11 | 14 | "github.com/cespare/xxhash/v2"
|
| 15 | + "github.com/cockroachdb/crlib/crtime" |
| 16 | + "github.com/cockroachdb/crlib/fifo" |
12 | 17 | "github.com/cockroachdb/errors"
|
13 | 18 | "github.com/cockroachdb/pebble/internal/base"
|
| 19 | + "github.com/cockroachdb/pebble/internal/cache" |
14 | 20 | "github.com/cockroachdb/pebble/internal/crc"
|
| 21 | + "github.com/cockroachdb/pebble/internal/invariants" |
| 22 | + "github.com/cockroachdb/pebble/internal/sstableinternal" |
| 23 | + "github.com/cockroachdb/pebble/objstorage" |
| 24 | + "github.com/cockroachdb/pebble/objstorage/objstorageprovider" |
15 | 25 | )
|
16 | 26 |
|
17 | 27 | // Handle is the file offset and length of a block.
|
@@ -145,6 +155,26 @@ func (c *Checksummer) Checksum(block []byte, blockType byte) (checksum uint32) {
|
145 | 155 | return checksum
|
146 | 156 | }
|
147 | 157 |
|
| 158 | +// ValidateChecksum validates the checksum of a block. |
| 159 | +func ValidateChecksum(checksumType ChecksumType, b []byte, bh Handle) error { |
| 160 | + expectedChecksum := binary.LittleEndian.Uint32(b[bh.Length+1:]) |
| 161 | + var computedChecksum uint32 |
| 162 | + switch checksumType { |
| 163 | + case ChecksumTypeCRC32c: |
| 164 | + computedChecksum = crc.New(b[:bh.Length+1]).Value() |
| 165 | + case ChecksumTypeXXHash64: |
| 166 | + computedChecksum = uint32(xxhash.Sum64(b[:bh.Length+1])) |
| 167 | + default: |
| 168 | + return errors.Errorf("unsupported checksum type: %d", checksumType) |
| 169 | + } |
| 170 | + if expectedChecksum != computedChecksum { |
| 171 | + return base.CorruptionErrorf("block %d/%d: %s checksum mismatch %x != %x", |
| 172 | + errors.Safe(bh.Offset), errors.Safe(bh.Length), checksumType, |
| 173 | + expectedChecksum, computedChecksum) |
| 174 | + } |
| 175 | + return nil |
| 176 | +} |
| 177 | + |
148 | 178 | // Metadata is an in-memory buffer that stores metadata for a block. It is
|
149 | 179 | // allocated together with the buffer storing the block and is initialized once
|
150 | 180 | // when the block is read from disk.
|
@@ -321,3 +351,284 @@ func (env *ReadEnv) BlockRead(blockLength uint64, readDuration time.Duration) {
|
321 | 351 | env.IterStats.Accumulate(blockLength, 0, readDuration)
|
322 | 352 | }
|
323 | 353 | }
|
| 354 | + |
| 355 | +// A Reader reads blocks from a single file, handling caching, checksum |
| 356 | +// validation and decompression. |
| 357 | +type Reader struct { |
| 358 | + readable objstorage.Readable |
| 359 | + cacheOpts sstableinternal.CacheOptions |
| 360 | + loadBlockSema *fifo.Semaphore |
| 361 | + logger base.LoggerAndTracer |
| 362 | + checksumType ChecksumType |
| 363 | +} |
| 364 | + |
| 365 | +// Init initializes the Reader to read blocks from the provided Readable. |
| 366 | +func (r *Reader) Init( |
| 367 | + readable objstorage.Readable, |
| 368 | + cacheOpts sstableinternal.CacheOptions, |
| 369 | + sema *fifo.Semaphore, |
| 370 | + logger base.LoggerAndTracer, |
| 371 | + checksumType ChecksumType, |
| 372 | +) { |
| 373 | + r.readable = readable |
| 374 | + r.cacheOpts = cacheOpts |
| 375 | + r.loadBlockSema = sema |
| 376 | + r.logger = logger |
| 377 | + r.checksumType = checksumType |
| 378 | + if r.cacheOpts.Cache == nil { |
| 379 | + r.cacheOpts.Cache = cache.New(0) |
| 380 | + } else { |
| 381 | + r.cacheOpts.Cache.Ref() |
| 382 | + } |
| 383 | + if r.cacheOpts.CacheID == 0 { |
| 384 | + r.cacheOpts.CacheID = r.cacheOpts.Cache.NewID() |
| 385 | + } |
| 386 | +} |
| 387 | + |
| 388 | +// FileNum returns the file number of the file being read. |
| 389 | +func (r *Reader) FileNum() base.DiskFileNum { |
| 390 | + return r.cacheOpts.FileNum |
| 391 | +} |
| 392 | + |
| 393 | +// ChecksumType returns the checksum type used by the reader. |
| 394 | +func (r *Reader) ChecksumType() ChecksumType { |
| 395 | + return r.checksumType |
| 396 | +} |
| 397 | + |
| 398 | +// Read reads the block referenced by the provided handle. The readHandle is |
| 399 | +// optional. |
| 400 | +func (r *Reader) Read( |
| 401 | + ctx context.Context, |
| 402 | + env ReadEnv, |
| 403 | + readHandle objstorage.ReadHandle, |
| 404 | + bh Handle, |
| 405 | + initBlockMetadataFn func(*Metadata, []byte) error, |
| 406 | +) (handle BufferHandle, _ error) { |
| 407 | + var cv *cache.Value |
| 408 | + var crh cache.ReadHandle |
| 409 | + hit := true |
| 410 | + if env.BufferPool == nil { |
| 411 | + var errorDuration time.Duration |
| 412 | + var err error |
| 413 | + cv, crh, errorDuration, hit, err = r.cacheOpts.Cache.GetWithReadHandle( |
| 414 | + ctx, r.cacheOpts.CacheID, r.cacheOpts.FileNum, bh.Offset) |
| 415 | + if errorDuration > 5*time.Millisecond && r.logger.IsTracingEnabled(ctx) { |
| 416 | + r.logger.Eventf( |
| 417 | + ctx, "waited for turn when %s time wasted by failed reads", errorDuration.String()) |
| 418 | + } |
| 419 | + // TODO(sumeer): consider tracing when waited longer than some duration |
| 420 | + // for turn to do the read. |
| 421 | + if err != nil { |
| 422 | + return BufferHandle{}, err |
| 423 | + } |
| 424 | + } else { |
| 425 | + // The compaction path uses env.BufferPool, and does not coordinate read |
| 426 | + // using a cache.ReadHandle. This is ok since only a single compaction is |
| 427 | + // reading a block. |
| 428 | + cv = r.cacheOpts.Cache.Get(r.cacheOpts.CacheID, r.cacheOpts.FileNum, bh.Offset) |
| 429 | + if cv != nil { |
| 430 | + hit = true |
| 431 | + } |
| 432 | + } |
| 433 | + // INVARIANT: hit => cv != nil |
| 434 | + if cv != nil { |
| 435 | + if hit { |
| 436 | + // Cache hit. |
| 437 | + if readHandle != nil { |
| 438 | + readHandle.RecordCacheHit(ctx, int64(bh.Offset), int64(bh.Length+TrailerLen)) |
| 439 | + } |
| 440 | + env.BlockServedFromCache(bh.Length) |
| 441 | + } |
| 442 | + if invariants.Enabled && crh.Valid() { |
| 443 | + panic("cache.ReadHandle must not be valid") |
| 444 | + } |
| 445 | + return CacheBufferHandle(cv), nil |
| 446 | + } |
| 447 | + |
| 448 | + // Need to read. First acquire loadBlockSema, if needed. |
| 449 | + if sema := r.loadBlockSema; sema != nil { |
| 450 | + if err := sema.Acquire(ctx, 1); err != nil { |
| 451 | + // An error here can only come from the context. |
| 452 | + return BufferHandle{}, err |
| 453 | + } |
| 454 | + defer sema.Release(1) |
| 455 | + } |
| 456 | + value, err := r.doRead(ctx, env, readHandle, bh, initBlockMetadataFn) |
| 457 | + if err != nil { |
| 458 | + if crh.Valid() { |
| 459 | + crh.SetReadError(err) |
| 460 | + } |
| 461 | + return BufferHandle{}, err |
| 462 | + } |
| 463 | + h := value.MakeHandle(crh, r.cacheOpts.CacheID, r.cacheOpts.FileNum, bh.Offset) |
| 464 | + return h, nil |
| 465 | +} |
| 466 | + |
| 467 | +// TODO(sumeer): should the threshold be configurable. |
| 468 | +const slowReadTracingThreshold = 5 * time.Millisecond |
| 469 | + |
| 470 | +// doRead is a helper for Read that does the read, checksum check, |
| 471 | +// decompression, and returns either a Value or an error. |
| 472 | +func (r *Reader) doRead( |
| 473 | + ctx context.Context, |
| 474 | + env ReadEnv, |
| 475 | + readHandle objstorage.ReadHandle, |
| 476 | + bh Handle, |
| 477 | + initBlockMetadataFn func(*Metadata, []byte) error, |
| 478 | +) (Value, error) { |
| 479 | + compressed := Alloc(int(bh.Length+TrailerLen), env.BufferPool) |
| 480 | + readStopwatch := makeStopwatch() |
| 481 | + var err error |
| 482 | + if readHandle != nil { |
| 483 | + err = readHandle.ReadAt(ctx, compressed.BlockData(), int64(bh.Offset)) |
| 484 | + } else { |
| 485 | + err = r.readable.ReadAt(ctx, compressed.BlockData(), int64(bh.Offset)) |
| 486 | + } |
| 487 | + readDuration := readStopwatch.stop() |
| 488 | + // Call IsTracingEnabled to avoid the allocations of boxing integers into an |
| 489 | + // interface{}, unless necessary. |
| 490 | + if readDuration >= slowReadTracingThreshold && r.logger.IsTracingEnabled(ctx) { |
| 491 | + _, file1, line1, _ := runtime.Caller(1) |
| 492 | + _, file2, line2, _ := runtime.Caller(2) |
| 493 | + r.logger.Eventf(ctx, "reading block of %d bytes took %s (fileNum=%s; %s/%s:%d -> %s/%s:%d)", |
| 494 | + int(bh.Length+TrailerLen), readDuration.String(), |
| 495 | + r.cacheOpts.FileNum, |
| 496 | + filepath.Base(filepath.Dir(file2)), filepath.Base(file2), line2, |
| 497 | + filepath.Base(filepath.Dir(file1)), filepath.Base(file1), line1) |
| 498 | + } |
| 499 | + if err != nil { |
| 500 | + compressed.Release() |
| 501 | + return Value{}, err |
| 502 | + } |
| 503 | + env.BlockRead(bh.Length, readDuration) |
| 504 | + if err = ValidateChecksum(r.checksumType, compressed.BlockData(), bh); err != nil { |
| 505 | + compressed.Release() |
| 506 | + err = errors.Wrapf(err, "pebble/table: table %s", r.cacheOpts.FileNum) |
| 507 | + return Value{}, err |
| 508 | + } |
| 509 | + typ := CompressionIndicator(compressed.BlockData()[bh.Length]) |
| 510 | + compressed.Truncate(int(bh.Length)) |
| 511 | + var decompressed Value |
| 512 | + if typ == NoCompressionIndicator { |
| 513 | + decompressed = compressed |
| 514 | + } else { |
| 515 | + // Decode the length of the decompressed value. |
| 516 | + decodedLen, prefixLen, err := DecompressedLen(typ, compressed.BlockData()) |
| 517 | + if err != nil { |
| 518 | + compressed.Release() |
| 519 | + return Value{}, err |
| 520 | + } |
| 521 | + decompressed = Alloc(decodedLen, env.BufferPool) |
| 522 | + err = DecompressInto(typ, compressed.BlockData()[prefixLen:], decompressed.BlockData()) |
| 523 | + compressed.Release() |
| 524 | + if err != nil { |
| 525 | + decompressed.Release() |
| 526 | + return Value{}, err |
| 527 | + } |
| 528 | + } |
| 529 | + if err = initBlockMetadataFn(decompressed.BlockMetadata(), decompressed.BlockData()); err != nil { |
| 530 | + decompressed.Release() |
| 531 | + return Value{}, err |
| 532 | + } |
| 533 | + return decompressed, nil |
| 534 | +} |
| 535 | + |
| 536 | +// Readable returns the underlying objstorage.Readable. |
| 537 | +// |
| 538 | +// Users should avoid accessing the underlying Readable if it can be avoided. |
| 539 | +func (r *Reader) Readable() objstorage.Readable { |
| 540 | + return r.readable |
| 541 | +} |
| 542 | + |
| 543 | +// GetFromCache retrieves the block from the cache, if it is present. |
| 544 | +// |
| 545 | +// Users should prefer using Read, which handles reading from object storage on |
| 546 | +// a cache miss. |
| 547 | +func (r *Reader) GetFromCache(bh Handle) *cache.Value { |
| 548 | + return r.cacheOpts.Cache.Get(r.cacheOpts.CacheID, r.cacheOpts.FileNum, bh.Offset) |
| 549 | +} |
| 550 | + |
| 551 | +// UsePreallocatedReadHandle returns a ReadHandle that reads from the reader and |
| 552 | +// uses the provided preallocated read handle to back the read handle, avoiding |
| 553 | +// an unnecessary allocation. |
| 554 | +func (r *Reader) UsePreallocatedReadHandle( |
| 555 | + readBeforeSize objstorage.ReadBeforeSize, rh *objstorageprovider.PreallocatedReadHandle, |
| 556 | +) objstorage.ReadHandle { |
| 557 | + return objstorageprovider.UsePreallocatedReadHandle(r.readable, readBeforeSize, rh) |
| 558 | +} |
| 559 | + |
| 560 | +// Close releases resources associated with the Reader. |
| 561 | +func (r *Reader) Close() error { |
| 562 | + r.cacheOpts.Cache.Unref() |
| 563 | + var err error |
| 564 | + if r.readable != nil { |
| 565 | + err = r.readable.Close() |
| 566 | + r.readable = nil |
| 567 | + } |
| 568 | + return err |
| 569 | +} |
| 570 | + |
| 571 | +// ReadRaw reads len(buf) bytes from the provided Readable at the given offset |
| 572 | +// into buf. It's used to read the footer of a table. |
| 573 | +func ReadRaw( |
| 574 | + ctx context.Context, |
| 575 | + f objstorage.Readable, |
| 576 | + readHandle objstorage.ReadHandle, |
| 577 | + logger base.LoggerAndTracer, |
| 578 | + fileNum base.DiskFileNum, |
| 579 | + buf []byte, |
| 580 | + off int64, |
| 581 | +) ([]byte, error) { |
| 582 | + size := f.Size() |
| 583 | + if size < int64(len(buf)) { |
| 584 | + return nil, base.CorruptionErrorf("pebble/table: invalid table %s (file size is too small)", errors.Safe(fileNum)) |
| 585 | + } |
| 586 | + |
| 587 | + readStopwatch := makeStopwatch() |
| 588 | + var err error |
| 589 | + if readHandle != nil { |
| 590 | + err = readHandle.ReadAt(ctx, buf, off) |
| 591 | + } else { |
| 592 | + err = f.ReadAt(ctx, buf, off) |
| 593 | + } |
| 594 | + readDuration := readStopwatch.stop() |
| 595 | + // Call IsTracingEnabled to avoid the allocations of boxing integers into an |
| 596 | + // interface{}, unless necessary. |
| 597 | + if readDuration >= slowReadTracingThreshold && logger.IsTracingEnabled(ctx) { |
| 598 | + logger.Eventf(ctx, "reading footer of %d bytes took %s", |
| 599 | + len(buf), readDuration.String()) |
| 600 | + } |
| 601 | + if err != nil { |
| 602 | + return nil, errors.Wrap(err, "pebble/table: invalid table (could not read footer)") |
| 603 | + } |
| 604 | + return buf, nil |
| 605 | +} |
| 606 | + |
| 607 | +// DeterministicReadBlockDurationForTesting is for tests that want a |
| 608 | +// deterministic value of the time to read a block (that is not in the cache). |
| 609 | +// The return value is a function that must be called before the test exits. |
| 610 | +func DeterministicReadBlockDurationForTesting() func() { |
| 611 | + drbdForTesting := deterministicReadBlockDurationForTesting |
| 612 | + deterministicReadBlockDurationForTesting = true |
| 613 | + return func() { |
| 614 | + deterministicReadBlockDurationForTesting = drbdForTesting |
| 615 | + } |
| 616 | +} |
| 617 | + |
| 618 | +var deterministicReadBlockDurationForTesting = false |
| 619 | + |
| 620 | +type deterministicStopwatchForTesting struct { |
| 621 | + startTime crtime.Mono |
| 622 | +} |
| 623 | + |
| 624 | +func makeStopwatch() deterministicStopwatchForTesting { |
| 625 | + return deterministicStopwatchForTesting{startTime: crtime.NowMono()} |
| 626 | +} |
| 627 | + |
| 628 | +func (w deterministicStopwatchForTesting) stop() time.Duration { |
| 629 | + dur := w.startTime.Elapsed() |
| 630 | + if deterministicReadBlockDurationForTesting { |
| 631 | + dur = slowReadTracingThreshold |
| 632 | + } |
| 633 | + return dur |
| 634 | +} |
0 commit comments