VictoriaMetrics/lib/storage/block_stream_reader.go
Aliaksandr Valialkin f4989edd96
lib/bytesutil: split Resize() into ResizeNoCopy() and ResizeWithCopy() functions
Previously bytesutil.Resize() was copying the original byte slice contents to a newly allocated slice.
This wasted CPU cycles and memory bandwidth in some places, where the original slice contents wasn't needed
after slize resizing. Switch such places to bytesutil.ResizeNoCopy().

Rename the original bytesutil.Resize() function to bytesutil.ResizeWithCopy() for the sake of improved readability.

Additionally, allocate new slice with `make()` instead of `append()`. This guarantees that the capacity of the allocated slice
exactly matches the requested size. The `append()` could return a slice with bigger capacity as an optimization for further `append()` calls.
This could result in excess memory usage when the returned byte slice was cached (for instance, in lib/blockcache).

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2007
2022-01-25 15:24:44 +02:00

385 lines
12 KiB
Go

package storage
import (
"fmt"
"io"
"path/filepath"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/filestream"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// blockStreamReader represents block stream reader.
type blockStreamReader struct {
// Currently active block.
Block Block
// Filesystem path to the stream reader.
//
// Is empty for inmemory stream readers.
path string
ph partHeader
// Use io.Reader type for timestampsReader and valuesReader
// in order to remove I2I conversion in readBlock
// when passing them to fs.ReadFullData
timestampsReader io.Reader
valuesReader io.Reader
indexReader filestream.ReadCloser
mrs []metaindexRow
// Points the current mr from mrs.
mr *metaindexRow
// The total number of rows read so far.
rowsCount uint64
// The total number of blocks read so far.
blocksCount uint64
// The number of block headers in the current index block.
indexBlockHeadersCount uint32
timestampsBlockOffset uint64
valuesBlockOffset uint64
indexBlockOffset uint64
prevTimestampsBlockOffset uint64
prevTimestampsData []byte
indexData []byte
compressedIndexData []byte
// Cursor to indexData.
indexCursor []byte
err error
}
func (bsr *blockStreamReader) assertWriteClosers() {
_ = bsr.timestampsReader.(filestream.ReadCloser)
_ = bsr.valuesReader.(filestream.ReadCloser)
}
func (bsr *blockStreamReader) reset() {
bsr.Block.Reset()
bsr.path = ""
bsr.ph.Reset()
bsr.timestampsReader = nil
bsr.valuesReader = nil
bsr.indexReader = nil
bsr.mrs = bsr.mrs[:0]
bsr.mr = nil
bsr.rowsCount = 0
bsr.blocksCount = 0
bsr.indexBlockHeadersCount = 0
bsr.timestampsBlockOffset = 0
bsr.valuesBlockOffset = 0
bsr.indexBlockOffset = 0
bsr.prevTimestampsBlockOffset = 0
bsr.prevTimestampsData = bsr.prevTimestampsData[:0]
bsr.indexData = bsr.indexData[:0]
bsr.compressedIndexData = bsr.compressedIndexData[:0]
bsr.indexCursor = nil
bsr.err = nil
}
// String returns human-readable representation of bsr.
func (bsr *blockStreamReader) String() string {
if len(bsr.path) > 0 {
return bsr.path
}
return bsr.ph.String()
}
// InitFromInmemoryPart initializes bsr from the given mp.
func (bsr *blockStreamReader) InitFromInmemoryPart(mp *inmemoryPart) {
bsr.reset()
bsr.ph = mp.ph
bsr.timestampsReader = mp.timestampsData.NewReader()
bsr.valuesReader = mp.valuesData.NewReader()
bsr.indexReader = mp.indexData.NewReader()
var err error
bsr.mrs, err = unmarshalMetaindexRows(bsr.mrs[:0], mp.metaindexData.NewReader())
if err != nil {
logger.Panicf("BUG: cannot unmarshal metaindex rows from inmemoryPart: %s", err)
}
bsr.assertWriteClosers()
}
// InitFromFilePart initializes bsr from a file-based part on the given path.
//
// Files in the part are always read without OS cache pollution,
// since they are usually deleted after the merge.
func (bsr *blockStreamReader) InitFromFilePart(path string) error {
bsr.reset()
path = filepath.Clean(path)
if err := bsr.ph.ParseFromPath(path); err != nil {
return fmt.Errorf("cannot parse path to part: %w", err)
}
timestampsPath := path + "/timestamps.bin"
timestampsFile, err := filestream.Open(timestampsPath, true)
if err != nil {
return fmt.Errorf("cannot open timestamps file in stream mode: %w", err)
}
valuesPath := path + "/values.bin"
valuesFile, err := filestream.Open(valuesPath, true)
if err != nil {
timestampsFile.MustClose()
return fmt.Errorf("cannot open values file in stream mode: %w", err)
}
indexPath := path + "/index.bin"
indexFile, err := filestream.Open(indexPath, true)
if err != nil {
timestampsFile.MustClose()
valuesFile.MustClose()
return fmt.Errorf("cannot open index file in stream mode: %w", err)
}
metaindexPath := path + "/metaindex.bin"
metaindexFile, err := filestream.Open(metaindexPath, true)
if err != nil {
timestampsFile.MustClose()
valuesFile.MustClose()
indexFile.MustClose()
return fmt.Errorf("cannot open metaindex file in stream mode: %w", err)
}
mrs, err := unmarshalMetaindexRows(bsr.mrs[:0], metaindexFile)
metaindexFile.MustClose()
if err != nil {
timestampsFile.MustClose()
valuesFile.MustClose()
indexFile.MustClose()
return fmt.Errorf("cannot unmarshal metaindex rows from file part %q: %w", metaindexPath, err)
}
bsr.path = path
bsr.timestampsReader = timestampsFile
bsr.valuesReader = valuesFile
bsr.indexReader = indexFile
bsr.mrs = mrs
bsr.assertWriteClosers()
return nil
}
// MustClose closes the bsr.
//
// It closes *Reader files passed to Init.
func (bsr *blockStreamReader) MustClose() {
bsr.timestampsReader.(filestream.ReadCloser).MustClose()
bsr.valuesReader.(filestream.ReadCloser).MustClose()
bsr.indexReader.MustClose()
bsr.reset()
}
// Error returns the last error.
func (bsr *blockStreamReader) Error() error {
if bsr.err == nil || bsr.err == io.EOF {
return nil
}
return fmt.Errorf("error when reading part %q: %w", bsr, bsr.err)
}
// NextBlock advances bsr to the next block.
func (bsr *blockStreamReader) NextBlock() bool {
if bsr.err != nil {
return false
}
tsidPrev := bsr.Block.bh.TSID
bsr.Block.Reset()
err := bsr.readBlock()
if err == nil {
if bsr.Block.bh.TSID.Less(&tsidPrev) {
bsr.err = fmt.Errorf("possible data corruption: the next TSID=%v is smaller than the previous TSID=%v", &bsr.Block.bh.TSID, &tsidPrev)
return false
}
if bsr.Block.bh.RowsCount == 0 {
bsr.err = fmt.Errorf("invalid block read with zero rows; block=%+v", &bsr.Block)
return false
}
return true
}
if err == io.EOF {
bsr.err = io.EOF
return false
}
bsr.err = fmt.Errorf("cannot read next block: %w", err)
return false
}
func (bsr *blockStreamReader) readBlock() error {
if len(bsr.indexCursor) == 0 {
if bsr.mr != nil && bsr.indexBlockHeadersCount != bsr.mr.BlockHeadersCount {
return fmt.Errorf("invalid number of block headers in the previous index block at offset %d; got %d; want %d",
bsr.prevIndexBlockOffset(), bsr.indexBlockHeadersCount, bsr.mr.BlockHeadersCount)
}
bsr.indexBlockHeadersCount = 0
if err := bsr.readIndexBlock(); err != nil {
if err == io.EOF {
return io.EOF
}
return fmt.Errorf("cannot read index block from index data: %w", err)
}
}
// Read block header.
if len(bsr.indexCursor) < marshaledBlockHeaderSize {
return fmt.Errorf("too short index data for reading block header at offset %d; got %d bytes; want %d bytes",
bsr.prevIndexBlockOffset(), len(bsr.indexCursor), marshaledBlockHeaderSize)
}
bsr.Block.headerData = append(bsr.Block.headerData[:0], bsr.indexCursor[:marshaledBlockHeaderSize]...)
bsr.indexCursor = bsr.indexCursor[marshaledBlockHeaderSize:]
tail, err := bsr.Block.bh.Unmarshal(bsr.Block.headerData)
if err != nil {
return fmt.Errorf("cannot parse block header read from index data at offset %d: %w", bsr.prevIndexBlockOffset(), err)
}
if len(tail) > 0 {
return fmt.Errorf("non-empty tail left after parsing block header at offset %d: %x", bsr.prevIndexBlockOffset(), tail)
}
bsr.blocksCount++
if bsr.blocksCount > bsr.ph.BlocksCount {
return fmt.Errorf("too many blocks found in the block stream; got %d; cannot be bigger than %d", bsr.blocksCount, bsr.ph.BlocksCount)
}
// Validate block header.
bsr.rowsCount += uint64(bsr.Block.bh.RowsCount)
if bsr.rowsCount > bsr.ph.RowsCount {
return fmt.Errorf("too many rows found in the block stream; got %d; cannot be bigger than %d", bsr.rowsCount, bsr.ph.RowsCount)
}
if bsr.Block.bh.MinTimestamp < bsr.ph.MinTimestamp {
return fmt.Errorf("invalid MinTimestamp at block header at offset %d; got %d; cannot be smaller than %d",
bsr.prevIndexBlockOffset(), bsr.Block.bh.MinTimestamp, bsr.ph.MinTimestamp)
}
if bsr.Block.bh.MaxTimestamp > bsr.ph.MaxTimestamp {
return fmt.Errorf("invalid MaxTimestamp at block header at offset %d; got %d; cannot be bigger than %d",
bsr.prevIndexBlockOffset(), bsr.Block.bh.MaxTimestamp, bsr.ph.MaxTimestamp)
}
usePrevTimestamps := len(bsr.prevTimestampsData) > 0 && bsr.Block.bh.TimestampsBlockOffset == bsr.prevTimestampsBlockOffset
if usePrevTimestamps {
if int(bsr.Block.bh.TimestampsBlockSize) != len(bsr.prevTimestampsData) {
return fmt.Errorf("invalid TimestampsBlockSize at block header at offset %d; got %d; want %d",
bsr.prevIndexBlockOffset(), bsr.Block.bh.TimestampsBlockSize, len(bsr.prevTimestampsData))
}
} else if bsr.Block.bh.TimestampsBlockOffset != bsr.timestampsBlockOffset {
return fmt.Errorf("invalid TimestampsBlockOffset at block header at offset %d; got %d; want %d",
bsr.prevIndexBlockOffset(), bsr.Block.bh.TimestampsBlockOffset, bsr.timestampsBlockOffset)
}
if bsr.Block.bh.ValuesBlockOffset != bsr.valuesBlockOffset {
return fmt.Errorf("invalid ValuesBlockOffset at block header at offset %d; got %d; want %d",
bsr.prevIndexBlockOffset(), bsr.Block.bh.ValuesBlockOffset, bsr.valuesBlockOffset)
}
// Read timestamps data.
if usePrevTimestamps {
bsr.Block.timestampsData = append(bsr.Block.timestampsData[:0], bsr.prevTimestampsData...)
} else {
bsr.Block.timestampsData = bytesutil.ResizeNoCopy(bsr.Block.timestampsData, int(bsr.Block.bh.TimestampsBlockSize))
if err := fs.ReadFullData(bsr.timestampsReader, bsr.Block.timestampsData); err != nil {
return fmt.Errorf("cannot read timestamps block at offset %d: %w", bsr.timestampsBlockOffset, err)
}
bsr.prevTimestampsBlockOffset = bsr.timestampsBlockOffset
bsr.prevTimestampsData = append(bsr.prevTimestampsData[:0], bsr.Block.timestampsData...)
}
// Read values data.
bsr.Block.valuesData = bytesutil.ResizeNoCopy(bsr.Block.valuesData, int(bsr.Block.bh.ValuesBlockSize))
if err := fs.ReadFullData(bsr.valuesReader, bsr.Block.valuesData); err != nil {
return fmt.Errorf("cannot read values block at offset %d: %w", bsr.valuesBlockOffset, err)
}
// Update offsets.
if !usePrevTimestamps {
bsr.timestampsBlockOffset += uint64(bsr.Block.bh.TimestampsBlockSize)
}
bsr.valuesBlockOffset += uint64(bsr.Block.bh.ValuesBlockSize)
bsr.indexBlockHeadersCount++
return nil
}
func (bsr *blockStreamReader) readIndexBlock() error {
// Go to the next metaindex row.
if len(bsr.mrs) == 0 {
return io.EOF
}
bsr.mr = &bsr.mrs[0]
bsr.mrs = bsr.mrs[1:]
// Validate metaindex row.
if bsr.indexBlockOffset != bsr.mr.IndexBlockOffset {
return fmt.Errorf("invalid IndexBlockOffset in metaindex row; got %d; want %d", bsr.mr.IndexBlockOffset, bsr.indexBlockOffset)
}
if bsr.mr.MinTimestamp < bsr.ph.MinTimestamp {
return fmt.Errorf("invalid MinTimesamp in metaindex row; got %d; cannot be smaller than %d", bsr.mr.MinTimestamp, bsr.ph.MinTimestamp)
}
if bsr.mr.MaxTimestamp > bsr.ph.MaxTimestamp {
return fmt.Errorf("invalid MaxTimestamp in metaindex row; got %d; cannot be bigger than %d", bsr.mr.MaxTimestamp, bsr.ph.MaxTimestamp)
}
// Read index block.
bsr.compressedIndexData = bytesutil.ResizeNoCopy(bsr.compressedIndexData, int(bsr.mr.IndexBlockSize))
if err := fs.ReadFullData(bsr.indexReader, bsr.compressedIndexData); err != nil {
return fmt.Errorf("cannot read index block from index data at offset %d: %w", bsr.indexBlockOffset, err)
}
tmpData, err := encoding.DecompressZSTD(bsr.indexData[:0], bsr.compressedIndexData)
if err != nil {
return fmt.Errorf("cannot decompress index block read at offset %d: %w", bsr.indexBlockOffset, err)
}
bsr.indexData = tmpData
bsr.indexCursor = bsr.indexData
// Update offsets.
bsr.indexBlockOffset += uint64(bsr.mr.IndexBlockSize)
return nil
}
func (bsr *blockStreamReader) prevIndexBlockOffset() uint64 {
return bsr.indexBlockOffset - uint64(bsr.mr.IndexBlockSize)
}
func getBlockStreamReader() *blockStreamReader {
v := bsrPool.Get()
if v == nil {
return &blockStreamReader{}
}
return v.(*blockStreamReader)
}
func putBlockStreamReader(bsr *blockStreamReader) {
bsr.MustClose()
bsrPool.Put(bsr)
}
var bsrPool sync.Pool