VictoriaMetrics/lib/logstorage/datadb.go
Aliaksandr Valialkin fe5f16b662
lib/logstorage: dynamically adjust the number of (bloom, values) shards in a part depending on the number of non-const columns
This allows reducing the amounts of data, which must be read during queries over logs with big number of fields (aka "wide events").
This, in turn, improves query performance when the data, which needs to be scanned during the query, doesn't fit OS page cache.

(cherry picked from commit 7a62eefa34)
2024-10-30 09:52:51 +01:00

1280 lines
34 KiB
Go

package logstorage
import (
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"sort"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
)
// The maximum size of big part.
//
// This number limits the maximum time required for building big part.
// This time shouldn't exceed a few days.
const maxBigPartSize = 1e12
// The maximum number of inmemory parts in the partition.
//
// The actual number of inmemory parts may exceed this value if in-memory mergers
// cannot keep up with the rate of creating new in-memory parts.
const maxInmemoryPartsPerPartition = 20
// The interval for guaranteed flush of recently ingested data from memory to on-disk parts,
// so they survive process crash.
var dataFlushInterval = 5 * time.Second
// Default number of parts to merge at once.
//
// This number has been obtained empirically - it gives the lowest possible overhead.
// See appendPartsToMerge tests for details.
const defaultPartsToMerge = 15
// minMergeMultiplier is the minimum multiplier for the size of the output part
// compared to the size of the maximum input part for the merge.
//
// Higher value reduces write amplification (disk write IO induced by the merge),
// while increases the number of unmerged parts.
// The 1.7 is good enough for production workloads.
const minMergeMultiplier = 1.7
// datadb represents a database with log data
type datadb struct {
// mergeIdx is used for generating unique directory names for parts
mergeIdx atomic.Uint64
inmemoryMergesTotal atomic.Uint64
inmemoryActiveMerges atomic.Int64
smallPartMergesTotal atomic.Uint64
smallPartActiveMerges atomic.Int64
bigPartMergesTotal atomic.Uint64
bigPartActiveMerges atomic.Int64
// pt is the partition the datadb belongs to
pt *partition
// path is the path to the directory with log data
path string
// flushInterval is interval for flushing the inmemory parts to disk
flushInterval time.Duration
// inmemoryParts contains a list of inmemory parts
inmemoryParts []*partWrapper
// smallParts contains a list of file-based small parts
smallParts []*partWrapper
// bigParts contains a list of file-based big parts
bigParts []*partWrapper
// partsLock protects parts from concurrent access
partsLock sync.Mutex
// wg is used for determining when background workers stop
//
// wg.Add() must be called under partsLock after checking whether stopCh isn't closed.
// This should prevent from calling wg.Add() after stopCh is closed and wg.Wait() is called.
wg sync.WaitGroup
// stopCh is used for notifying background workers to stop
//
// It must be closed under partsLock in order to prevent from calling wg.Add()
// after stopCh is closed.
stopCh chan struct{}
}
// partWrapper is a wrapper for opened part.
type partWrapper struct {
// refCount is the number of references to p.
//
// When the number of references reaches zero, then p is closed.
refCount atomic.Int32
// The flag, which is set when the part must be deleted after refCount reaches zero.
mustDrop atomic.Bool
// p is an opened part
p *part
// mp references inmemory part used for initializing p.
mp *inmemoryPart
// isInMerge is set to true if the part takes part in merge.
isInMerge bool
// The deadline when in-memory part must be flushed to disk.
flushDeadline time.Time
}
func (pw *partWrapper) incRef() {
pw.refCount.Add(1)
}
func (pw *partWrapper) decRef() {
n := pw.refCount.Add(-1)
if n > 0 {
return
}
deletePath := ""
if pw.mp == nil {
if pw.mustDrop.Load() {
deletePath = pw.p.path
}
} else {
putInmemoryPart(pw.mp)
pw.mp = nil
}
mustClosePart(pw.p)
pw.p = nil
if deletePath != "" {
fs.MustRemoveAll(deletePath)
}
}
func mustCreateDatadb(path string) {
fs.MustMkdirFailIfExist(path)
mustWritePartNames(path, nil, nil)
}
// mustOpenDatadb opens datadb at the given path with the given flushInterval for in-memory data.
func mustOpenDatadb(pt *partition, path string, flushInterval time.Duration) *datadb {
// Remove temporary directories, which may be left after unclean shutdown.
fs.MustRemoveTemporaryDirs(path)
partNames := mustReadPartNames(path)
mustRemoveUnusedDirs(path, partNames)
var smallParts []*partWrapper
var bigParts []*partWrapper
for _, partName := range partNames {
// Make sure the partName exists on disk.
// If it is missing, then manual action from the user is needed,
// since this is unexpected state, which cannot occur under normal operation,
// including unclean shutdown.
partPath := filepath.Join(path, partName)
if !fs.IsPathExist(partPath) {
partsFile := filepath.Join(path, partsFilename)
logger.Panicf("FATAL: part %q is listed in %q, but is missing on disk; "+
"ensure %q contents is not corrupted; remove %q to rebuild its' content from the list of existing parts",
partPath, partsFile, partsFile, partsFile)
}
p := mustOpenFilePart(pt, partPath)
pw := newPartWrapper(p, nil, time.Time{})
if p.ph.CompressedSizeBytes > getMaxInmemoryPartSize() {
bigParts = append(bigParts, pw)
} else {
smallParts = append(smallParts, pw)
}
}
ddb := &datadb{
pt: pt,
flushInterval: flushInterval,
path: path,
smallParts: smallParts,
bigParts: bigParts,
stopCh: make(chan struct{}),
}
ddb.mergeIdx.Store(uint64(time.Now().UnixNano()))
ddb.startBackgroundWorkers()
return ddb
}
func (ddb *datadb) startBackgroundWorkers() {
// Start file parts mergers, so they could start merging unmerged parts if needed.
// There is no need in starting in-memory parts mergers, since there are no in-memory parts yet.
ddb.startSmallPartsMergers()
ddb.startBigPartsMergers()
ddb.startInmemoryPartsFlusher()
}
var (
inmemoryPartsConcurrencyCh = make(chan struct{}, cgroup.AvailableCPUs())
smallPartsConcurrencyCh = make(chan struct{}, cgroup.AvailableCPUs())
bigPartsConcurrencyCh = make(chan struct{}, cgroup.AvailableCPUs())
)
func (ddb *datadb) startSmallPartsMergers() {
ddb.partsLock.Lock()
for i := 0; i < cap(smallPartsConcurrencyCh); i++ {
ddb.startSmallPartsMergerLocked()
}
ddb.partsLock.Unlock()
}
func (ddb *datadb) startBigPartsMergers() {
ddb.partsLock.Lock()
for i := 0; i < cap(bigPartsConcurrencyCh); i++ {
ddb.startBigPartsMergerLocked()
}
ddb.partsLock.Unlock()
}
func (ddb *datadb) startInmemoryPartsMergerLocked() {
if needStop(ddb.stopCh) {
return
}
ddb.wg.Add(1)
go func() {
ddb.inmemoryPartsMerger()
ddb.wg.Done()
}()
}
func (ddb *datadb) startSmallPartsMergerLocked() {
if needStop(ddb.stopCh) {
return
}
ddb.wg.Add(1)
go func() {
ddb.smallPartsMerger()
ddb.wg.Done()
}()
}
func (ddb *datadb) startBigPartsMergerLocked() {
if needStop(ddb.stopCh) {
return
}
ddb.wg.Add(1)
go func() {
ddb.bigPartsMerger()
ddb.wg.Done()
}()
}
func (ddb *datadb) startInmemoryPartsFlusher() {
ddb.wg.Add(1)
go func() {
ddb.inmemoryPartsFlusher()
ddb.wg.Done()
}()
}
func (ddb *datadb) inmemoryPartsFlusher() {
// Do not add jitter to d in order to guarantee the flush interval
ticker := time.NewTicker(dataFlushInterval)
defer ticker.Stop()
for {
select {
case <-ddb.stopCh:
return
case <-ticker.C:
ddb.mustFlushInmemoryPartsToFiles(false)
}
}
}
func (ddb *datadb) mustFlushInmemoryPartsToFiles(isFinal bool) {
currentTime := time.Now()
var pws []*partWrapper
ddb.partsLock.Lock()
for _, pw := range ddb.inmemoryParts {
if !pw.isInMerge && (isFinal || pw.flushDeadline.Before(currentTime)) {
pw.isInMerge = true
pws = append(pws, pw)
}
}
ddb.partsLock.Unlock()
ddb.mustMergePartsToFiles(pws)
}
func (ddb *datadb) mustMergePartsToFiles(pws []*partWrapper) {
wg := getWaitGroup()
for len(pws) > 0 {
pwsToMerge, pwsRemaining := getPartsForOptimalMerge(pws)
wg.Add(1)
inmemoryPartsConcurrencyCh <- struct{}{}
go func(pwsChunk []*partWrapper) {
defer func() {
<-inmemoryPartsConcurrencyCh
wg.Done()
}()
ddb.mustMergeParts(pwsChunk, true)
}(pwsToMerge)
pws = pwsRemaining
}
wg.Wait()
putWaitGroup(wg)
}
// getPartsForOptimalMerge returns parts from pws for optimal merge, plus the remaining parts.
//
// the pws items are replaced by nil after the call. This is needed for helping Go GC to reclaim the referenced items.
func getPartsForOptimalMerge(pws []*partWrapper) ([]*partWrapper, []*partWrapper) {
pwsToMerge := appendPartsToMerge(nil, pws, math.MaxUint64)
if len(pwsToMerge) == 0 {
return pws, nil
}
m := partsToMap(pwsToMerge)
pwsRemaining := make([]*partWrapper, 0, len(pws)-len(pwsToMerge))
for _, pw := range pws {
if _, ok := m[pw]; !ok {
pwsRemaining = append(pwsRemaining, pw)
}
}
// Clear references to pws items, so they could be reclaimed faster by Go GC.
for i := range pws {
pws[i] = nil
}
return pwsToMerge, pwsRemaining
}
func getWaitGroup() *sync.WaitGroup {
v := wgPool.Get()
if v == nil {
return &sync.WaitGroup{}
}
return v.(*sync.WaitGroup)
}
func putWaitGroup(wg *sync.WaitGroup) {
wgPool.Put(wg)
}
var wgPool sync.Pool
func (ddb *datadb) inmemoryPartsMerger() {
for {
if needStop(ddb.stopCh) {
return
}
maxOutBytes := ddb.getMaxBigPartSize()
ddb.partsLock.Lock()
pws := getPartsToMergeLocked(ddb.inmemoryParts, maxOutBytes)
ddb.partsLock.Unlock()
if len(pws) == 0 {
// Nothing to merge
return
}
inmemoryPartsConcurrencyCh <- struct{}{}
ddb.mustMergeParts(pws, false)
<-inmemoryPartsConcurrencyCh
}
}
func (ddb *datadb) smallPartsMerger() {
for {
if needStop(ddb.stopCh) {
return
}
maxOutBytes := ddb.getMaxBigPartSize()
ddb.partsLock.Lock()
pws := getPartsToMergeLocked(ddb.smallParts, maxOutBytes)
ddb.partsLock.Unlock()
if len(pws) == 0 {
// Nothing to merge
return
}
smallPartsConcurrencyCh <- struct{}{}
ddb.mustMergeParts(pws, false)
<-smallPartsConcurrencyCh
}
}
func (ddb *datadb) bigPartsMerger() {
for {
if needStop(ddb.stopCh) {
return
}
maxOutBytes := ddb.getMaxBigPartSize()
ddb.partsLock.Lock()
pws := getPartsToMergeLocked(ddb.bigParts, maxOutBytes)
ddb.partsLock.Unlock()
if len(pws) == 0 {
// Nothing to merge
return
}
bigPartsConcurrencyCh <- struct{}{}
ddb.mustMergeParts(pws, false)
<-bigPartsConcurrencyCh
}
}
// getPartsToMergeLocked returns optimal parts to merge from pws.
//
// The summary size of the returned parts must be smaller than maxOutBytes.
func getPartsToMergeLocked(pws []*partWrapper, maxOutBytes uint64) []*partWrapper {
pwsRemaining := make([]*partWrapper, 0, len(pws))
for _, pw := range pws {
if !pw.isInMerge {
pwsRemaining = append(pwsRemaining, pw)
}
}
pwsToMerge := appendPartsToMerge(nil, pwsRemaining, maxOutBytes)
for _, pw := range pwsToMerge {
if pw.isInMerge {
logger.Panicf("BUG: partWrapper.isInMerge cannot be set")
}
pw.isInMerge = true
}
return pwsToMerge
}
func assertIsInMerge(pws []*partWrapper) {
for _, pw := range pws {
if !pw.isInMerge {
logger.Panicf("BUG: partWrapper.isInMerge unexpectedly set to false")
}
}
}
// mustMergeParts merges pws to a single resulting part.
//
// if isFinal is set, then the resulting part is guaranteed to be saved to disk.
// if isFinal is set, then the merge process cannot be interrupted.
// The pws may remain unmerged after returning from the function if there is no enough disk space.
//
// All the parts inside pws must have isInMerge field set to true.
// The isInMerge field inside pws parts is set to false before returning from the function.
func (ddb *datadb) mustMergeParts(pws []*partWrapper, isFinal bool) {
if len(pws) == 0 {
// Nothing to merge.
return
}
assertIsInMerge(pws)
defer ddb.releasePartsToMerge(pws)
startTime := time.Now()
dstPartType := ddb.getDstPartType(pws, isFinal)
if dstPartType != partInmemory {
// Make sure there is enough disk space for performing the merge
partsSize := getCompressedSize(pws)
needReleaseDiskSpace := tryReserveDiskSpace(ddb.path, partsSize)
if needReleaseDiskSpace {
defer releaseDiskSpace(partsSize)
} else {
if !isFinal {
// There is no enough disk space for performing the non-final merge.
return
}
// Try performing final merge even if there is no enough disk space
// in order to persist in-memory data to disk.
// It is better to crash on out of memory error in this case.
}
}
switch dstPartType {
case partInmemory:
ddb.inmemoryMergesTotal.Add(1)
ddb.inmemoryActiveMerges.Add(1)
defer ddb.inmemoryActiveMerges.Add(-1)
case partSmall:
ddb.smallPartMergesTotal.Add(1)
ddb.smallPartActiveMerges.Add(1)
defer ddb.smallPartActiveMerges.Add(-1)
case partBig:
ddb.bigPartMergesTotal.Add(1)
ddb.bigPartActiveMerges.Add(1)
defer ddb.bigPartActiveMerges.Add(-1)
default:
logger.Panicf("BUG: unknown partType=%d", dstPartType)
}
// Initialize destination paths.
mergeIdx := ddb.nextMergeIdx()
dstPartPath := ddb.getDstPartPath(dstPartType, mergeIdx)
if isFinal && len(pws) == 1 && pws[0].mp != nil {
// Fast path: flush a single in-memory part to disk.
mp := pws[0].mp
mp.MustStoreToDisk(dstPartPath)
pwNew := ddb.openCreatedPart(&mp.ph, pws, nil, dstPartPath)
ddb.swapSrcWithDstParts(pws, pwNew, dstPartType)
return
}
// Prepare blockStreamReaders for source parts.
bsrs := mustOpenBlockStreamReaders(pws)
// Prepare BlockStreamWriter for destination part.
srcSize := uint64(0)
srcRowsCount := uint64(0)
srcBlocksCount := uint64(0)
bloomValuesShardsCount := uint64(0)
for _, pw := range pws {
ph := &pw.p.ph
srcSize += ph.CompressedSizeBytes
srcRowsCount += ph.RowsCount
srcBlocksCount += ph.BlocksCount
if ph.BloomValuesFieldsCount > bloomValuesShardsCount {
bloomValuesShardsCount = ph.BloomValuesFieldsCount
}
}
bsw := getBlockStreamWriter()
var mpNew *inmemoryPart
if dstPartType == partInmemory {
mpNew = getInmemoryPart()
bsw.MustInitForInmemoryPart(mpNew)
} else {
nocache := dstPartType == partBig
bsw.MustInitForFilePart(dstPartPath, nocache, bloomValuesShardsCount)
}
// Merge source parts to destination part.
var ph partHeader
stopCh := ddb.stopCh
if isFinal {
// The final merge shouldn't be stopped even if ddb.stopCh is closed.
stopCh = nil
}
mustMergeBlockStreams(&ph, bsw, bsrs, stopCh)
putBlockStreamWriter(bsw)
for _, bsr := range bsrs {
putBlockStreamReader(bsr)
}
// Persist partHeader for destination part after the merge.
if mpNew != nil {
mpNew.ph = ph
} else {
ph.mustWriteMetadata(dstPartPath)
// Make sure the created part directory listing is synced.
fs.MustSyncPath(dstPartPath)
}
if needStop(stopCh) {
// Remove incomplete destination part
if dstPartType != partInmemory {
fs.MustRemoveAll(dstPartPath)
}
return
}
// Atomically swap the source parts with the newly created part.
pwNew := ddb.openCreatedPart(&ph, pws, mpNew, dstPartPath)
dstSize := uint64(0)
dstRowsCount := uint64(0)
dstBlocksCount := uint64(0)
if pwNew != nil {
pDst := pwNew.p
dstSize = pDst.ph.CompressedSizeBytes
dstRowsCount = pDst.ph.RowsCount
dstBlocksCount = pDst.ph.BlocksCount
}
ddb.swapSrcWithDstParts(pws, pwNew, dstPartType)
d := time.Since(startTime)
if d <= time.Minute {
return
}
// Log stats for long merges.
durationSecs := d.Seconds()
rowsPerSec := int(float64(srcRowsCount) / durationSecs)
logger.Infof("merged (%d parts, %d rows, %d blocks, %d bytes) into (1 part, %d rows, %d blocks, %d bytes) in %.3f seconds at %d rows/sec to %q",
len(pws), srcRowsCount, srcBlocksCount, srcSize, dstRowsCount, dstBlocksCount, dstSize, durationSecs, rowsPerSec, dstPartPath)
}
func (ddb *datadb) nextMergeIdx() uint64 {
return ddb.mergeIdx.Add(1)
}
type partType int
var (
partInmemory = partType(0)
partSmall = partType(1)
partBig = partType(2)
)
func (ddb *datadb) getDstPartType(pws []*partWrapper, isFinal bool) partType {
dstPartSize := getCompressedSize(pws)
if dstPartSize > ddb.getMaxSmallPartSize() {
return partBig
}
if isFinal || dstPartSize > getMaxInmemoryPartSize() {
return partSmall
}
if !areAllInmemoryParts(pws) {
// If at least a single source part is located in file,
// then the destination part must be in file for durability reasons.
return partSmall
}
return partInmemory
}
func (ddb *datadb) getDstPartPath(dstPartType partType, mergeIdx uint64) string {
ptPath := ddb.path
dstPartPath := ""
if dstPartType != partInmemory {
dstPartPath = filepath.Join(ptPath, fmt.Sprintf("%016X", mergeIdx))
}
return dstPartPath
}
func (ddb *datadb) openCreatedPart(ph *partHeader, pws []*partWrapper, mpNew *inmemoryPart, dstPartPath string) *partWrapper {
// Open the created part.
if ph.RowsCount == 0 {
// The created part is empty. Remove it
if mpNew == nil {
fs.MustRemoveAll(dstPartPath)
}
return nil
}
var p *part
var flushDeadline time.Time
if mpNew != nil {
// Open the created part from memory.
p = mustOpenInmemoryPart(ddb.pt, mpNew)
flushDeadline = ddb.getFlushToDiskDeadline(pws)
} else {
// Open the created part from disk.
p = mustOpenFilePart(ddb.pt, dstPartPath)
}
return newPartWrapper(p, mpNew, flushDeadline)
}
func (ddb *datadb) mustAddRows(lr *LogRows) {
if len(lr.streamIDs) == 0 {
return
}
inmemoryPartsConcurrencyCh <- struct{}{}
mp := getInmemoryPart()
mp.mustInitFromRows(lr)
p := mustOpenInmemoryPart(ddb.pt, mp)
<-inmemoryPartsConcurrencyCh
flushDeadline := time.Now().Add(ddb.flushInterval)
pw := newPartWrapper(p, mp, flushDeadline)
ddb.partsLock.Lock()
ddb.inmemoryParts = append(ddb.inmemoryParts, pw)
ddb.startInmemoryPartsMergerLocked()
ddb.partsLock.Unlock()
}
// DatadbStats contains various stats for datadb.
type DatadbStats struct {
// InmemoryMergesTotal is the number of inmemory merges performed in the given datadb.
InmemoryMergesTotal uint64
// InmemoryActiveMerges is the number of currently active inmemory merges performed by the given datadb.
InmemoryActiveMerges uint64
// SmallPartMergesTotal is the number of small file merges performed in the given datadb.
SmallPartMergesTotal uint64
// SmallPartActiveMerges is the number of currently active small file merges performed by the given datadb.
SmallPartActiveMerges uint64
// BigPartMergesTotal is the number of big file merges performed in the given datadb.
BigPartMergesTotal uint64
// BigPartActiveMerges is the number of currently active big file merges performed by the given datadb.
BigPartActiveMerges uint64
// InmemoryRowsCount is the number of rows, which weren't flushed to disk yet.
InmemoryRowsCount uint64
// SmallPartRowsCount is the number of rows stored on disk in small parts.
SmallPartRowsCount uint64
// BigPartRowsCount is the number of rows stored on disk in big parts.
BigPartRowsCount uint64
// InmemoryParts is the number of in-memory parts, which weren't flushed to disk yet.
InmemoryParts uint64
// SmallParts is the number of file-based small parts stored on disk.
SmallParts uint64
// BigParts is the number of file-based big parts stored on disk.
BigParts uint64
// InmemoryBlocks is the number of in-memory blocks, which weren't flushed to disk yet.
InmemoryBlocks uint64
// SmallPartBlocks is the number of file-based small blocks stored on disk.
SmallPartBlocks uint64
// BigPartBlocks is the number of file-based big blocks stored on disk.
BigPartBlocks uint64
// CompressedInmemorySize is the size of compressed data stored in memory.
CompressedInmemorySize uint64
// CompressedSmallPartSize is the size of compressed small parts data stored on disk.
CompressedSmallPartSize uint64
// CompressedBigPartSize is the size of compressed big data stored on disk.
CompressedBigPartSize uint64
// UncompressedInmemorySize is the size of uncompressed data stored in memory.
UncompressedInmemorySize uint64
// UncompressedSmallPartSize is the size of uncompressed small data stored on disk.
UncompressedSmallPartSize uint64
// UncompressedBigPartSize is the size of uncompressed big data stored on disk.
UncompressedBigPartSize uint64
}
func (s *DatadbStats) reset() {
*s = DatadbStats{}
}
// RowsCount returns the number of rows stored in datadb.
func (s *DatadbStats) RowsCount() uint64 {
return s.InmemoryRowsCount + s.SmallPartRowsCount + s.BigPartRowsCount
}
// updateStats updates s with ddb stats.
func (ddb *datadb) updateStats(s *DatadbStats) {
s.InmemoryMergesTotal += ddb.inmemoryMergesTotal.Load()
s.InmemoryActiveMerges += uint64(ddb.inmemoryActiveMerges.Load())
s.SmallPartMergesTotal += ddb.smallPartMergesTotal.Load()
s.SmallPartActiveMerges += uint64(ddb.smallPartActiveMerges.Load())
s.BigPartMergesTotal += ddb.bigPartMergesTotal.Load()
s.BigPartActiveMerges += uint64(ddb.bigPartActiveMerges.Load())
ddb.partsLock.Lock()
s.InmemoryRowsCount += getRowsCount(ddb.inmemoryParts)
s.SmallPartRowsCount += getRowsCount(ddb.smallParts)
s.BigPartRowsCount += getRowsCount(ddb.bigParts)
s.InmemoryParts += uint64(len(ddb.inmemoryParts))
s.SmallParts += uint64(len(ddb.smallParts))
s.BigParts += uint64(len(ddb.bigParts))
s.InmemoryBlocks += getBlocksCount(ddb.inmemoryParts)
s.SmallPartBlocks += getBlocksCount(ddb.smallParts)
s.BigPartBlocks += getBlocksCount(ddb.bigParts)
s.CompressedInmemorySize += getCompressedSize(ddb.inmemoryParts)
s.CompressedSmallPartSize += getCompressedSize(ddb.smallParts)
s.CompressedBigPartSize += getCompressedSize(ddb.bigParts)
s.UncompressedInmemorySize += getUncompressedSize(ddb.inmemoryParts)
s.UncompressedSmallPartSize += getUncompressedSize(ddb.smallParts)
s.UncompressedBigPartSize += getUncompressedSize(ddb.bigParts)
ddb.partsLock.Unlock()
}
// debugFlush() makes sure that the recently ingested data is availalbe for search.
func (ddb *datadb) debugFlush() {
// Nothing to do, since all the ingested data is available for search via ddb.inmemoryParts.
}
func (ddb *datadb) swapSrcWithDstParts(pws []*partWrapper, pwNew *partWrapper, dstPartType partType) {
// Atomically unregister old parts and add new part to pt.
partsToRemove := partsToMap(pws)
removedInmemoryParts := 0
removedSmallParts := 0
removedBigParts := 0
ddb.partsLock.Lock()
ddb.inmemoryParts, removedInmemoryParts = removeParts(ddb.inmemoryParts, partsToRemove)
ddb.smallParts, removedSmallParts = removeParts(ddb.smallParts, partsToRemove)
ddb.bigParts, removedBigParts = removeParts(ddb.bigParts, partsToRemove)
if pwNew != nil {
switch dstPartType {
case partInmemory:
ddb.inmemoryParts = append(ddb.inmemoryParts, pwNew)
ddb.startInmemoryPartsMergerLocked()
case partSmall:
ddb.smallParts = append(ddb.smallParts, pwNew)
ddb.startSmallPartsMergerLocked()
case partBig:
ddb.bigParts = append(ddb.bigParts, pwNew)
ddb.startBigPartsMergerLocked()
default:
logger.Panicf("BUG: unknown partType=%d", dstPartType)
}
}
// Atomically store the updated list of file-based parts on disk.
// This must be performed under partsLock in order to prevent from races
// when multiple concurrently running goroutines update the list.
if removedSmallParts > 0 || removedBigParts > 0 || pwNew != nil && dstPartType != partInmemory {
smallPartNames := getPartNames(ddb.smallParts)
bigPartNames := getPartNames(ddb.bigParts)
mustWritePartNames(ddb.path, smallPartNames, bigPartNames)
}
ddb.partsLock.Unlock()
removedParts := removedInmemoryParts + removedSmallParts + removedBigParts
if removedParts != len(partsToRemove) {
logger.Panicf("BUG: unexpected number of parts removed; got %d, want %d", removedParts, len(partsToRemove))
}
// Mark old parts as must be deleted and decrement reference count, so they are eventually closed and deleted.
for _, pw := range pws {
pw.mustDrop.Store(true)
pw.decRef()
}
}
func partsToMap(pws []*partWrapper) map[*partWrapper]struct{} {
m := make(map[*partWrapper]struct{}, len(pws))
for _, pw := range pws {
m[pw] = struct{}{}
}
if len(m) != len(pws) {
logger.Panicf("BUG: %d duplicate parts found out of %d parts", len(pws)-len(m), len(pws))
}
return m
}
func removeParts(pws []*partWrapper, partsToRemove map[*partWrapper]struct{}) ([]*partWrapper, int) {
dst := pws[:0]
for _, pw := range pws {
if _, ok := partsToRemove[pw]; !ok {
dst = append(dst, pw)
}
}
for i := len(dst); i < len(pws); i++ {
pws[i] = nil
}
return dst, len(pws) - len(dst)
}
func mustOpenBlockStreamReaders(pws []*partWrapper) []*blockStreamReader {
bsrs := make([]*blockStreamReader, 0, len(pws))
for _, pw := range pws {
bsr := getBlockStreamReader()
if pw.mp != nil {
bsr.MustInitFromInmemoryPart(pw.mp)
} else {
bsr.MustInitFromFilePart(pw.p.path)
}
bsrs = append(bsrs, bsr)
}
return bsrs
}
func newPartWrapper(p *part, mp *inmemoryPart, flushDeadline time.Time) *partWrapper {
pw := &partWrapper{
p: p,
mp: mp,
flushDeadline: flushDeadline,
}
// Increase reference counter for newly created part - it is decreased when the part
// is removed from the list of open parts.
pw.incRef()
return pw
}
func (ddb *datadb) getFlushToDiskDeadline(pws []*partWrapper) time.Time {
d := time.Now().Add(ddb.flushInterval)
for _, pw := range pws {
if pw.mp != nil && pw.flushDeadline.Before(d) {
d = pw.flushDeadline
}
}
return d
}
func getMaxInmemoryPartSize() uint64 {
// Allocate 10% of allowed memory for in-memory parts.
n := uint64(0.1 * float64(memory.Allowed()) / maxInmemoryPartsPerPartition)
if n < 1e6 {
n = 1e6
}
return n
}
func areAllInmemoryParts(pws []*partWrapper) bool {
for _, pw := range pws {
if pw.mp == nil {
return false
}
}
return true
}
func (ddb *datadb) releasePartsToMerge(pws []*partWrapper) {
ddb.partsLock.Lock()
for _, pw := range pws {
if !pw.isInMerge {
logger.Panicf("BUG: missing isInMerge flag on the part %q", pw.p.path)
}
pw.isInMerge = false
}
ddb.partsLock.Unlock()
}
func (ddb *datadb) getMaxBigPartSize() uint64 {
return getMaxOutBytes(ddb.path)
}
func (ddb *datadb) getMaxSmallPartSize() uint64 {
// Small parts are cached in the OS page cache,
// so limit their size by the remaining free RAM.
mem := memory.Remaining()
n := uint64(mem) / defaultPartsToMerge
if n < 10e6 {
n = 10e6
}
// Make sure the output part fits available disk space for small parts.
sizeLimit := getMaxOutBytes(ddb.path)
if n > sizeLimit {
n = sizeLimit
}
return n
}
func getMaxOutBytes(path string) uint64 {
n := availableDiskSpace(path)
if n > maxBigPartSize {
n = maxBigPartSize
}
return n
}
func availableDiskSpace(path string) uint64 {
available := fs.MustGetFreeSpace(path)
reserved := reservedDiskSpace.Load()
if available < reserved {
return 0
}
return available - reserved
}
func tryReserveDiskSpace(path string, n uint64) bool {
available := fs.MustGetFreeSpace(path)
reserved := reserveDiskSpace(n)
if available >= reserved {
return true
}
releaseDiskSpace(n)
return false
}
func reserveDiskSpace(n uint64) uint64 {
return reservedDiskSpace.Add(n)
}
func releaseDiskSpace(n uint64) {
reservedDiskSpace.Add(^(n - 1))
}
// reservedDiskSpace tracks global reserved disk space for currently executed
// background merges across all the partitions.
//
// It should allow avoiding background merges when there is no free disk space.
var reservedDiskSpace atomic.Uint64
func needStop(stopCh <-chan struct{}) bool {
select {
case <-stopCh:
return true
default:
return false
}
}
// mustCloseDatadb can be called only when nobody accesses ddb.
func mustCloseDatadb(ddb *datadb) {
// Notify background workers to stop.
// Make it under ddb.partsLock in order to prevent from calling ddb.wg.Add()
// after ddb.stopCh is closed and ddb.wg.Wait() is called.
ddb.partsLock.Lock()
close(ddb.stopCh)
ddb.partsLock.Unlock()
// Wait for background workers to stop.
ddb.wg.Wait()
// flush in-memory data to disk
ddb.mustFlushInmemoryPartsToFiles(true)
if len(ddb.inmemoryParts) > 0 {
logger.Panicf("BUG: the number of in-memory parts must be zero after flushing them to disk; got %d", len(ddb.inmemoryParts))
}
ddb.inmemoryParts = nil
// close small parts
for _, pw := range ddb.smallParts {
pw.decRef()
if n := pw.refCount.Load(); n != 0 {
logger.Panicf("BUG: there are %d references to smallPart", n)
}
}
ddb.smallParts = nil
// close big parts
for _, pw := range ddb.bigParts {
pw.decRef()
if n := pw.refCount.Load(); n != 0 {
logger.Panicf("BUG: there are %d references to bigPart", n)
}
}
ddb.bigParts = nil
ddb.path = ""
ddb.pt = nil
}
func getPartNames(pws []*partWrapper) []string {
partNames := make([]string, 0, len(pws))
for _, pw := range pws {
if pw.mp != nil {
// Skip in-memory parts
continue
}
partName := filepath.Base(pw.p.path)
partNames = append(partNames, partName)
}
sort.Strings(partNames)
return partNames
}
func mustWritePartNames(path string, smallPartNames, bigPartNames []string) {
partNames := append([]string{}, smallPartNames...)
partNames = append(partNames, bigPartNames...)
data, err := json.Marshal(partNames)
if err != nil {
logger.Panicf("BUG: cannot marshal partNames to JSON: %s", err)
}
partNamesPath := filepath.Join(path, partsFilename)
fs.MustWriteAtomic(partNamesPath, data, true)
}
func mustReadPartNames(path string) []string {
partNamesPath := filepath.Join(path, partsFilename)
data, err := os.ReadFile(partNamesPath)
if err != nil {
logger.Panicf("FATAL: cannot read %s: %s", partNamesPath, err)
}
var partNames []string
if err := json.Unmarshal(data, &partNames); err != nil {
logger.Panicf("FATAL: cannot parse %s: %s", partNamesPath, err)
}
return partNames
}
// mustRemoveUnusedDirs removes dirs at path, which are missing in partNames.
//
// These dirs may be left after unclean shutdown.
func mustRemoveUnusedDirs(path string, partNames []string) {
des := fs.MustReadDir(path)
m := make(map[string]struct{}, len(partNames))
for _, partName := range partNames {
m[partName] = struct{}{}
}
removedDirs := 0
for _, de := range des {
if !fs.IsDirOrSymlink(de) {
// Skip non-directories.
continue
}
fn := de.Name()
if _, ok := m[fn]; !ok {
deletePath := filepath.Join(path, fn)
fs.MustRemoveAll(deletePath)
removedDirs++
}
}
if removedDirs > 0 {
fs.MustSyncPath(path)
}
}
// appendPartsToMerge finds optimal parts to merge from src,
// appends them to dst and returns the result.
func appendPartsToMerge(dst, src []*partWrapper, maxOutBytes uint64) []*partWrapper {
if len(src) < 2 {
// There is no need in merging zero or one part :)
return dst
}
// Filter out too big parts.
// This should reduce N for O(N^2) algorithm below.
maxInPartBytes := uint64(float64(maxOutBytes) / minMergeMultiplier)
tmp := make([]*partWrapper, 0, len(src))
for _, pw := range src {
if pw.p.ph.CompressedSizeBytes > maxInPartBytes {
continue
}
tmp = append(tmp, pw)
}
src = tmp
sortPartsForOptimalMerge(src)
maxSrcParts := defaultPartsToMerge
if maxSrcParts > len(src) {
maxSrcParts = len(src)
}
minSrcParts := (maxSrcParts + 1) / 2
if minSrcParts < 2 {
minSrcParts = 2
}
// Exhaustive search for parts giving the lowest write amplification when merged.
var pws []*partWrapper
maxM := float64(0)
for i := minSrcParts; i <= maxSrcParts; i++ {
for j := 0; j <= len(src)-i; j++ {
a := src[j : j+i]
if a[0].p.ph.CompressedSizeBytes*uint64(len(a)) < a[len(a)-1].p.ph.CompressedSizeBytes {
// Do not merge parts with too big difference in size,
// since this results in unbalanced merges.
continue
}
outSize := getCompressedSize(a)
if outSize > maxOutBytes {
// There is no need in verifying remaining parts with bigger sizes.
break
}
m := float64(outSize) / float64(a[len(a)-1].p.ph.CompressedSizeBytes)
if m < maxM {
continue
}
maxM = m
pws = a
}
}
minM := float64(defaultPartsToMerge) / 2
if minM < minMergeMultiplier {
minM = minMergeMultiplier
}
if maxM < minM {
// There is no sense in merging parts with too small m,
// since this leads to high disk write IO.
return dst
}
return append(dst, pws...)
}
func sortPartsForOptimalMerge(pws []*partWrapper) {
// Sort src parts by size and backwards timestamp.
// This should improve adjanced points' locality in the merged parts.
sort.Slice(pws, func(i, j int) bool {
a := &pws[i].p.ph
b := &pws[j].p.ph
if a.CompressedSizeBytes == b.CompressedSizeBytes {
return a.MinTimestamp > b.MinTimestamp
}
return a.CompressedSizeBytes < b.CompressedSizeBytes
})
}
func getCompressedSize(pws []*partWrapper) uint64 {
n := uint64(0)
for _, pw := range pws {
n += pw.p.ph.CompressedSizeBytes
}
return n
}
func getUncompressedSize(pws []*partWrapper) uint64 {
n := uint64(0)
for _, pw := range pws {
n += pw.p.ph.UncompressedSizeBytes
}
return n
}
func getRowsCount(pws []*partWrapper) uint64 {
n := uint64(0)
for _, pw := range pws {
n += pw.p.ph.RowsCount
}
return n
}
func getBlocksCount(pws []*partWrapper) uint64 {
n := uint64(0)
for _, pw := range pws {
n += pw.p.ph.BlocksCount
}
return n
}
func (ddb *datadb) mustForceMergeAllParts() {
// Flush inmemory parts to files before forced merge
ddb.mustFlushInmemoryPartsToFiles(true)
var pws []*partWrapper
// Collect all the file parts for forced merge
ddb.partsLock.Lock()
pws = appendAllPartsForMergeLocked(pws, ddb.smallParts)
pws = appendAllPartsForMergeLocked(pws, ddb.bigParts)
ddb.partsLock.Unlock()
// If len(pws) == 1, then the merge must run anyway.
// This allows applying the configured retention, removing the deleted data, etc.
// Merge pws optimally
wg := getWaitGroup()
for len(pws) > 0 {
pwsToMerge, pwsRemaining := getPartsForOptimalMerge(pws)
wg.Add(1)
bigPartsConcurrencyCh <- struct{}{}
go func(pwsChunk []*partWrapper) {
defer func() {
<-bigPartsConcurrencyCh
wg.Done()
}()
ddb.mustMergeParts(pwsChunk, false)
}(pwsToMerge)
pws = pwsRemaining
}
wg.Wait()
putWaitGroup(wg)
}
func appendAllPartsForMergeLocked(dst, src []*partWrapper) []*partWrapper {
for _, pw := range src {
if !pw.isInMerge {
pw.isInMerge = true
dst = append(dst, pw)
}
}
return dst
}