2019-05-22 23:16:55 +02:00
|
|
|
package storage
|
|
|
|
|
|
|
|
import (
|
2023-03-19 09:36:05 +01:00
|
|
|
"encoding/json"
|
2020-09-17 02:02:35 +02:00
|
|
|
"errors"
|
2019-05-22 23:16:55 +02:00
|
|
|
"fmt"
|
|
|
|
"os"
|
|
|
|
"path/filepath"
|
|
|
|
"sort"
|
|
|
|
"strings"
|
|
|
|
"sync"
|
|
|
|
"sync/atomic"
|
|
|
|
"time"
|
|
|
|
"unsafe"
|
|
|
|
|
2020-12-08 19:49:32 +01:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
|
2019-05-22 23:16:55 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
|
2022-12-06 00:15:00 +01:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/mergeset"
|
2024-01-22 17:12:37 +01:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timeutil"
|
2019-05-22 23:16:55 +02:00
|
|
|
)
|
|
|
|
|
2021-08-25 08:35:03 +02:00
|
|
|
// maxBigPartSize is the maximum size of a big part.
|
2019-05-22 23:16:55 +02:00
|
|
|
//
|
|
|
|
// Capping the size bounds the maximum time required for building a big part;
|
|
|
|
// that time shouldn't exceed a few days.
// NOTE(review): the unit is presumably bytes - confirm against the callers.
|
2021-08-25 08:35:03 +02:00
|
|
|
const maxBigPartSize = 1e12
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2024-05-12 10:23:53 +02:00
|
|
|
// maxInmemoryParts is the maximum expected number of in-memory parts per partition.
|
2022-12-13 01:49:21 +01:00
|
|
|
//
|
2024-05-12 10:23:53 +02:00
|
|
|
// The actual number of in-memory parts may exceed this value if in-memory mergers
|
|
|
|
// cannot keep up with the rate of creating new in-memory parts.
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
const maxInmemoryParts = 60
|
2022-12-13 01:49:21 +01:00
|
|
|
|
2019-05-22 23:16:55 +02:00
|
|
|
// defaultPartsToMerge is the default number of parts to merge at once.
|
|
|
|
//
|
|
|
|
// The value has been obtained empirically - it gives the lowest possible overhead.
|
|
|
|
// See appendPartsToMerge tests for details.
|
|
|
|
const defaultPartsToMerge = 15
|
|
|
|
|
2019-12-19 17:12:02 +01:00
|
|
|
// rawRowsShardsPerPartition is the number of shards for rawRow entries per partition.
|
|
|
|
//
|
|
|
|
// Higher number of shards reduces CPU contention and increases the max bandwidth on multi-core systems.
// The value is taken from cgroup.AvailableCPUs() when the package-level variable is initialized.
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
var rawRowsShardsPerPartition = cgroup.AvailableCPUs()
|
2019-12-19 17:12:02 +01:00
|
|
|
|
2023-02-13 13:27:13 +01:00
|
|
|
// pendingRowsFlushInterval is the interval (2 seconds) for flushing buffered rows into parts, so they become visible to search.
|
2024-02-22 19:06:37 +01:00
|
|
|
const pendingRowsFlushInterval = 2 * time.Second
|
2022-12-06 00:15:00 +01:00
|
|
|
|
|
|
|
// dataFlushInterval is the interval for guaranteed flush of recently ingested data from memory to on-disk parts,
|
|
|
|
// so they survive process crash. It defaults to 5 seconds and can be overridden via SetDataFlushInterval.
|
|
|
|
var dataFlushInterval = 5 * time.Second
|
|
|
|
|
|
|
|
// SetDataFlushInterval sets the interval for guaranteed flush of recently ingested data from memory to disk.
|
|
|
|
//
|
|
|
|
// The data can be flushed from memory to disk more frequently if it doesn't fit the memory limit.
|
|
|
|
//
|
|
|
|
// This function must be called before initializing the storage.
|
|
|
|
func SetDataFlushInterval(d time.Duration) {
|
2024-05-13 16:44:30 +02:00
|
|
|
if d >= time.Second {
|
2022-12-06 00:15:00 +01:00
|
|
|
dataFlushInterval = d
|
|
|
|
mergeset.SetDataFlushInterval(d)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-02-22 19:06:37 +01:00
|
|
|
// maxRawRowsPerShard is the maximum number of rawRow items in rawRowsShard.
|
|
|
|
//
|
|
|
|
// The shard size is limited to 8MiB (8<<20 bytes divided by the size of a single rawRow),
// since this gives the lowest CPU usage under high ingestion rate.
|
|
|
|
const maxRawRowsPerShard = (8 << 20) / int(unsafe.Sizeof(rawRow{}))
|
2019-05-22 23:16:55 +02:00
|
|
|
|
|
|
|
// partition represents a partition.
|
|
|
|
type partition struct {
|
2024-02-23 23:15:21 +01:00
|
|
|
activeInmemoryMerges atomic.Int64
|
|
|
|
activeSmallMerges atomic.Int64
|
|
|
|
activeBigMerges atomic.Int64
|
2019-10-17 17:22:56 +02:00
|
|
|
|
2024-02-23 23:15:21 +01:00
|
|
|
inmemoryMergesCount atomic.Uint64
|
|
|
|
smallMergesCount atomic.Uint64
|
|
|
|
bigMergesCount atomic.Uint64
|
2022-12-06 00:15:00 +01:00
|
|
|
|
2024-02-23 23:15:21 +01:00
|
|
|
inmemoryRowsMerged atomic.Uint64
|
|
|
|
smallRowsMerged atomic.Uint64
|
|
|
|
bigRowsMerged atomic.Uint64
|
2019-10-17 17:22:56 +02:00
|
|
|
|
2024-02-23 23:15:21 +01:00
|
|
|
inmemoryRowsDeleted atomic.Uint64
|
|
|
|
smallRowsDeleted atomic.Uint64
|
|
|
|
bigRowsDeleted atomic.Uint64
|
2019-10-17 17:22:56 +02:00
|
|
|
|
2022-06-15 17:37:52 +02:00
|
|
|
isDedupScheduled atomic.Bool
|
|
|
|
|
2024-02-23 23:15:21 +01:00
|
|
|
mergeIdx atomic.Uint64
|
2019-10-17 17:22:56 +02:00
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// the path to directory with smallParts.
|
2019-05-22 23:16:55 +02:00
|
|
|
smallPartsPath string
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
|
|
|
|
// the path to directory with bigParts.
|
|
|
|
bigPartsPath string
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2022-10-23 15:08:54 +02:00
|
|
|
// The parent storage.
|
|
|
|
s *Storage
|
2019-05-22 23:16:55 +02:00
|
|
|
|
|
|
|
// Name is the name of the partition in the form YYYY_MM.
|
|
|
|
name string
|
|
|
|
|
|
|
|
// The time range for the partition. Usually this is a whole month.
|
|
|
|
tr TimeRange
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
// rawRows contains recently added rows that haven't been converted into parts yet.
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
//
|
|
|
|
// rawRows are converted into inmemoryParts on every pendingRowsFlushInterval or when rawRows becomes full.
|
|
|
|
//
|
|
|
|
// rawRows aren't visible for search due to performance reasons.
|
2022-12-06 00:15:00 +01:00
|
|
|
rawRows rawRowsShards
|
|
|
|
|
|
|
|
// partsLock protects inmemoryParts, smallParts and bigParts.
|
2019-05-22 23:16:55 +02:00
|
|
|
partsLock sync.Mutex
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// Contains inmemory parts with recently ingested data, which are visible for search.
|
2022-12-06 00:15:00 +01:00
|
|
|
inmemoryParts []*partWrapper
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// Contains file-based parts with small number of items, which are visible for search.
|
2019-05-22 23:16:55 +02:00
|
|
|
smallParts []*partWrapper
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// Contains file-based parts with big number of items, which are visible for search.
|
2019-05-22 23:16:55 +02:00
|
|
|
bigParts []*partWrapper
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// stopCh is used for notifying all the background workers to stop.
|
|
|
|
//
|
|
|
|
// It must be closed under partsLock in order to prevent from calling wg.Add()
|
|
|
|
// after stopCh is closed.
|
2019-05-22 23:16:55 +02:00
|
|
|
stopCh chan struct{}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// wg is used for waiting for all the background workers to stop.
|
|
|
|
//
|
|
|
|
// wg.Add() must be called under partsLock after checking whether stopCh isn't closed.
|
|
|
|
// This should prevent from calling wg.Add() after stopCh is closed and wg.Wait() is called.
|
2022-12-04 08:03:05 +01:00
|
|
|
wg sync.WaitGroup
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// partWrapper is a wrapper for the part.
|
|
|
|
type partWrapper struct {
|
2019-10-17 17:22:56 +02:00
|
|
|
// The number of references to the part.
|
2024-02-23 21:54:55 +01:00
|
|
|
refCount atomic.Int32
|
2023-03-19 09:36:05 +01:00
|
|
|
|
|
|
|
// The flag, which is set when the part must be deleted after refCount reaches zero.
|
2023-06-15 11:17:45 +02:00
|
|
|
// This field should be updated only after partWrapper
|
|
|
|
// was removed from the list of active parts.
|
2024-02-23 21:54:55 +01:00
|
|
|
mustDrop atomic.Bool
|
2019-10-17 17:22:56 +02:00
|
|
|
|
2019-05-22 23:16:55 +02:00
|
|
|
// The part itself.
|
|
|
|
p *part
|
|
|
|
|
|
|
|
// non-nil if the part is inmemoryPart.
|
|
|
|
mp *inmemoryPart
|
|
|
|
|
|
|
|
// Whether the part is in merge now.
|
|
|
|
isInMerge bool
|
2022-12-06 00:15:00 +01:00
|
|
|
|
|
|
|
// The deadline when in-memory part must be flushed to disk.
|
|
|
|
flushToDiskDeadline time.Time
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func (pw *partWrapper) incRef() {
|
2024-02-23 21:54:55 +01:00
|
|
|
pw.refCount.Add(1)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func (pw *partWrapper) decRef() {
|
2024-02-23 21:54:55 +01:00
|
|
|
n := pw.refCount.Add(-1)
|
|
|
|
if n < 0 {
|
|
|
|
logger.Panicf("BUG: pw.refCount must be bigger than 0; got %d", n)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
if n > 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
deletePath := ""
|
2024-02-23 21:54:55 +01:00
|
|
|
if pw.mp == nil && pw.mustDrop.Load() {
|
2023-03-19 09:36:05 +01:00
|
|
|
deletePath = pw.p.path
|
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
if pw.mp != nil {
|
|
|
|
putInmemoryPart(pw.mp)
|
|
|
|
pw.mp = nil
|
|
|
|
}
|
|
|
|
pw.p.MustClose()
|
|
|
|
pw.p = nil
|
2023-03-19 09:36:05 +01:00
|
|
|
|
|
|
|
if deletePath != "" {
|
|
|
|
fs.MustRemoveAll(deletePath)
|
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2023-04-14 07:11:56 +02:00
|
|
|
// mustCreatePartition creates new partition for the given timestamp and the given paths
|
2019-05-22 23:16:55 +02:00
|
|
|
// to small and big partitions.
|
2023-04-14 07:11:56 +02:00
|
|
|
func mustCreatePartition(timestamp int64, smallPartitionsPath, bigPartitionsPath string, s *Storage) *partition {
|
2019-05-22 23:16:55 +02:00
|
|
|
name := timestampToPartitionName(timestamp)
|
2023-03-25 22:33:54 +01:00
|
|
|
smallPartsPath := filepath.Join(filepath.Clean(smallPartitionsPath), name)
|
|
|
|
bigPartsPath := filepath.Join(filepath.Clean(bigPartitionsPath), name)
|
2019-05-22 23:16:55 +02:00
|
|
|
logger.Infof("creating a partition %q with smallPartsPath=%q, bigPartsPath=%q", name, smallPartsPath, bigPartsPath)
|
|
|
|
|
2023-04-14 07:11:56 +02:00
|
|
|
fs.MustMkdirFailIfExist(smallPartsPath)
|
|
|
|
fs.MustMkdirFailIfExist(bigPartsPath)
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2022-10-24 00:30:50 +02:00
|
|
|
pt := newPartition(name, smallPartsPath, bigPartsPath, s)
|
2019-05-22 23:16:55 +02:00
|
|
|
pt.tr.fromPartitionTimestamp(timestamp)
|
2022-12-04 09:01:04 +01:00
|
|
|
pt.startBackgroundWorkers()
|
2019-05-22 23:16:55 +02:00
|
|
|
|
|
|
|
logger.Infof("partition %q has been created", name)
|
|
|
|
|
2023-04-14 07:11:56 +02:00
|
|
|
return pt
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2022-12-04 09:01:04 +01:00
|
|
|
func (pt *partition) startBackgroundWorkers() {
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// Start file parts mergers, so they could start merging unmerged parts if needed.
|
|
|
|
// There is no need in starting in-memory parts mergers, since there are no in-memory parts yet.
|
|
|
|
pt.startSmallPartsMergers()
|
|
|
|
pt.startBigPartsMergers()
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
pt.startPendingRowsFlusher()
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
pt.startInmemoryPartsFlusher()
|
2022-12-04 09:01:04 +01:00
|
|
|
pt.startStalePartsRemover()
|
|
|
|
}
|
|
|
|
|
2019-05-22 23:16:55 +02:00
|
|
|
// Drop drops all the data on the storage for the given pt.
|
|
|
|
//
|
|
|
|
// The pt must be detached from table before calling pt.Drop.
|
|
|
|
func (pt *partition) Drop() {
|
|
|
|
logger.Infof("dropping partition %q at smallPartsPath=%q, bigPartsPath=%q", pt.name, pt.smallPartsPath, pt.bigPartsPath)
|
2020-12-25 10:45:47 +01:00
|
|
|
|
2022-09-13 12:37:34 +02:00
|
|
|
fs.MustRemoveDirAtomic(pt.smallPartsPath)
|
|
|
|
fs.MustRemoveDirAtomic(pt.bigPartsPath)
|
2019-05-22 23:16:55 +02:00
|
|
|
logger.Infof("partition %q has been dropped", pt.name)
|
|
|
|
}
|
|
|
|
|
2023-04-15 08:01:20 +02:00
|
|
|
// mustOpenPartition opens the existing partition from the given paths.
|
|
|
|
func mustOpenPartition(smallPartsPath, bigPartsPath string, s *Storage) *partition {
|
2019-05-22 23:16:55 +02:00
|
|
|
smallPartsPath = filepath.Clean(smallPartsPath)
|
|
|
|
bigPartsPath = filepath.Clean(bigPartsPath)
|
|
|
|
|
2023-03-25 19:57:37 +01:00
|
|
|
name := filepath.Base(smallPartsPath)
|
2023-03-25 19:43:19 +01:00
|
|
|
if !strings.HasSuffix(bigPartsPath, name) {
|
2023-11-21 10:52:53 +01:00
|
|
|
logger.Panicf("FATAL: partition name in bigPartsPath %q doesn't match smallPartsPath %q; want %q", bigPartsPath, smallPartsPath, name)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2024-04-16 19:07:36 +02:00
|
|
|
partsFile := filepath.Join(smallPartsPath, partsFilename)
|
|
|
|
partNamesSmall, partNamesBig := mustReadPartNames(partsFile, smallPartsPath, bigPartsPath)
|
2023-03-19 09:36:05 +01:00
|
|
|
|
2024-04-16 19:07:36 +02:00
|
|
|
smallParts := mustOpenParts(partsFile, smallPartsPath, partNamesSmall)
|
|
|
|
bigParts := mustOpenParts(partsFile, bigPartsPath, partNamesBig)
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2024-04-16 19:07:36 +02:00
|
|
|
if !fs.IsPathExist(partsFile) {
|
2023-07-07 02:05:59 +02:00
|
|
|
// Create parts.json file if it doesn't exist yet.
|
|
|
|
// This should protect from possible carshloops just after the migration from versions below v1.90.0
|
|
|
|
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4336
|
2023-06-15 11:19:22 +02:00
|
|
|
mustWritePartNames(smallParts, bigParts, smallPartsPath)
|
|
|
|
}
|
|
|
|
|
2022-10-24 00:30:50 +02:00
|
|
|
pt := newPartition(name, smallPartsPath, bigPartsPath, s)
|
2019-05-22 23:16:55 +02:00
|
|
|
pt.smallParts = smallParts
|
|
|
|
pt.bigParts = bigParts
|
|
|
|
if err := pt.tr.fromPartitionName(name); err != nil {
|
2023-04-15 08:01:20 +02:00
|
|
|
logger.Panicf("FATAL: cannot obtain partition time range from smallPartsPath %q: %s", smallPartsPath, err)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2022-12-04 09:01:04 +01:00
|
|
|
pt.startBackgroundWorkers()
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2023-04-15 08:01:20 +02:00
|
|
|
return pt
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2022-10-24 00:30:50 +02:00
|
|
|
func newPartition(name, smallPartsPath, bigPartsPath string, s *Storage) *partition {
|
2019-12-19 17:12:02 +01:00
|
|
|
p := &partition{
|
2019-05-22 23:16:55 +02:00
|
|
|
name: name,
|
|
|
|
smallPartsPath: smallPartsPath,
|
|
|
|
bigPartsPath: bigPartsPath,
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
s: s,
|
|
|
|
stopCh: make(chan struct{}),
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2024-02-23 23:15:21 +01:00
|
|
|
p.mergeIdx.Store(uint64(time.Now().UnixNano()))
|
2019-12-19 17:12:02 +01:00
|
|
|
p.rawRows.init()
|
|
|
|
return p
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// partitionMetrics contains essential metrics for the partition.
//
// The metrics are grouped by the part type: in-memory, small (file-based) and big (file-based).
type partitionMetrics struct {
	// The number of rows, which aren't converted to parts yet.
	PendingRows uint64

	// Index blocks cache stats.
	IndexBlocksCacheSize         uint64
	IndexBlocksCacheSizeBytes    uint64
	IndexBlocksCacheSizeMaxBytes uint64
	IndexBlocksCacheRequests     uint64
	IndexBlocksCacheMisses       uint64

	// The total size of parts in bytes.
	InmemorySizeBytes uint64
	SmallSizeBytes    uint64
	BigSizeBytes      uint64

	// The total number of rows in parts.
	InmemoryRowsCount uint64
	SmallRowsCount    uint64
	BigRowsCount      uint64

	// The total number of blocks in parts.
	InmemoryBlocksCount uint64
	SmallBlocksCount    uint64
	BigBlocksCount      uint64

	// The number of parts.
	InmemoryPartsCount uint64
	SmallPartsCount    uint64
	BigPartsCount      uint64

	// The number of merges, which are in progress now.
	ActiveInmemoryMerges uint64
	ActiveSmallMerges    uint64
	ActiveBigMerges      uint64

	// The number of performed merges.
	InmemoryMergesCount uint64
	SmallMergesCount    uint64
	BigMergesCount      uint64

	// The number of rows processed by merges.
	InmemoryRowsMerged uint64
	SmallRowsMerged    uint64
	BigRowsMerged      uint64

	// The number of rows deleted during merges.
	InmemoryRowsDeleted uint64
	SmallRowsDeleted    uint64
	BigRowsDeleted      uint64

	// The sum of reference counters across parts.
	InmemoryPartsRefCount uint64
	SmallPartsRefCount    uint64
	BigPartsRefCount      uint64

	// The number of partitions with scheduled dedup/downsampling and the total size of their parts.
	ScheduledDownsamplingPartitions     uint64
	ScheduledDownsamplingPartitionsSize uint64
}

// TotalRowsCount returns total number of rows in pm.
//
// The result includes pending rows plus rows in in-memory, small and big parts.
func (pm *partitionMetrics) TotalRowsCount() uint64 {
	return pm.PendingRows + pm.InmemoryRowsCount + pm.SmallRowsCount + pm.BigRowsCount
}
|
|
|
|
|
|
|
|
// UpdateMetrics updates m with metrics from pt.
|
|
|
|
func (pt *partition) UpdateMetrics(m *partitionMetrics) {
|
2022-12-06 00:15:00 +01:00
|
|
|
m.PendingRows += uint64(pt.rawRows.Len())
|
2019-05-22 23:16:55 +02:00
|
|
|
|
|
|
|
pt.partsLock.Lock()
|
|
|
|
|
2022-06-15 17:37:52 +02:00
|
|
|
isDedupScheduled := pt.isDedupScheduled.Load()
|
|
|
|
if isDedupScheduled {
|
|
|
|
m.ScheduledDownsamplingPartitions++
|
|
|
|
}
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
for _, pw := range pt.inmemoryParts {
|
2019-05-22 23:16:55 +02:00
|
|
|
p := pw.p
|
2022-12-06 00:15:00 +01:00
|
|
|
m.InmemoryRowsCount += p.ph.RowsCount
|
|
|
|
m.InmemoryBlocksCount += p.ph.BlocksCount
|
|
|
|
m.InmemorySizeBytes += p.size
|
2024-02-23 21:54:55 +01:00
|
|
|
m.InmemoryPartsRefCount += uint64(pw.refCount.Load())
|
2022-06-15 17:37:52 +02:00
|
|
|
if isDedupScheduled {
|
|
|
|
m.ScheduledDownsamplingPartitionsSize += p.size
|
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
for _, pw := range pt.smallParts {
|
|
|
|
p := pw.p
|
|
|
|
m.SmallRowsCount += p.ph.RowsCount
|
|
|
|
m.SmallBlocksCount += p.ph.BlocksCount
|
2019-07-04 18:09:40 +02:00
|
|
|
m.SmallSizeBytes += p.size
|
2024-02-23 21:54:55 +01:00
|
|
|
m.SmallPartsRefCount += uint64(pw.refCount.Load())
|
2022-06-15 17:37:52 +02:00
|
|
|
if isDedupScheduled {
|
|
|
|
m.ScheduledDownsamplingPartitionsSize += p.size
|
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
for _, pw := range pt.bigParts {
|
|
|
|
p := pw.p
|
|
|
|
m.BigRowsCount += p.ph.RowsCount
|
|
|
|
m.BigBlocksCount += p.ph.BlocksCount
|
|
|
|
m.BigSizeBytes += p.size
|
2024-02-23 21:54:55 +01:00
|
|
|
m.BigPartsRefCount += uint64(pw.refCount.Load())
|
2022-06-15 17:37:52 +02:00
|
|
|
if isDedupScheduled {
|
|
|
|
m.ScheduledDownsamplingPartitionsSize += p.size
|
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
m.InmemoryPartsCount += uint64(len(pt.inmemoryParts))
|
2019-05-22 23:16:55 +02:00
|
|
|
m.SmallPartsCount += uint64(len(pt.smallParts))
|
2022-12-06 00:15:00 +01:00
|
|
|
m.BigPartsCount += uint64(len(pt.bigParts))
|
2019-05-22 23:16:55 +02:00
|
|
|
|
|
|
|
pt.partsLock.Unlock()
|
|
|
|
|
2022-01-20 17:34:59 +01:00
|
|
|
m.IndexBlocksCacheSize = uint64(ibCache.Len())
|
|
|
|
m.IndexBlocksCacheSizeBytes = uint64(ibCache.SizeBytes())
|
|
|
|
m.IndexBlocksCacheSizeMaxBytes = uint64(ibCache.SizeMaxBytes())
|
|
|
|
m.IndexBlocksCacheRequests = ibCache.Requests()
|
|
|
|
m.IndexBlocksCacheMisses = ibCache.Misses()
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2024-02-23 23:15:21 +01:00
|
|
|
m.ActiveInmemoryMerges += uint64(pt.activeInmemoryMerges.Load())
|
|
|
|
m.ActiveSmallMerges += uint64(pt.activeSmallMerges.Load())
|
|
|
|
m.ActiveBigMerges += uint64(pt.activeBigMerges.Load())
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2024-02-23 23:15:21 +01:00
|
|
|
m.InmemoryMergesCount += pt.inmemoryMergesCount.Load()
|
|
|
|
m.SmallMergesCount += pt.smallMergesCount.Load()
|
|
|
|
m.BigMergesCount += pt.bigMergesCount.Load()
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2024-02-23 23:15:21 +01:00
|
|
|
m.InmemoryRowsMerged += pt.inmemoryRowsMerged.Load()
|
|
|
|
m.SmallRowsMerged += pt.smallRowsMerged.Load()
|
|
|
|
m.BigRowsMerged += pt.bigRowsMerged.Load()
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2024-02-23 23:15:21 +01:00
|
|
|
m.InmemoryRowsDeleted += pt.inmemoryRowsDeleted.Load()
|
|
|
|
m.SmallRowsDeleted += pt.smallRowsDeleted.Load()
|
|
|
|
m.BigRowsDeleted += pt.bigRowsDeleted.Load()
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// AddRows adds the given rows to the partition pt.
|
|
|
|
//
|
|
|
|
// All the rows must fit the partition by timestamp range
|
|
|
|
// and must have valid PrecisionBits.
|
|
|
|
func (pt *partition) AddRows(rows []rawRow) {
|
|
|
|
if len(rows) == 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2023-04-15 05:52:36 +02:00
|
|
|
if isDebug {
|
|
|
|
// Validate all the rows.
|
|
|
|
for i := range rows {
|
|
|
|
r := &rows[i]
|
|
|
|
if !pt.HasTimestamp(r.Timestamp) {
|
|
|
|
logger.Panicf("BUG: row %+v has Timestamp outside partition %q range %+v", r, pt.smallPartsPath, &pt.tr)
|
|
|
|
}
|
|
|
|
if err := encoding.CheckPrecisionBits(r.PrecisionBits); err != nil {
|
|
|
|
logger.Panicf("BUG: row %+v has invalid PrecisionBits: %s", r, err)
|
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-19 17:12:02 +01:00
|
|
|
pt.rawRows.addRows(pt, rows)
|
|
|
|
}
|
|
|
|
|
2023-04-15 05:52:36 +02:00
|
|
|
// isDebug enables the extra per-row validation in partition.AddRows.
//
// It is disabled by default.
var isDebug bool
|
|
|
|
|
2019-12-19 17:12:02 +01:00
|
|
|
type rawRowsShards struct {
|
2024-02-23 21:27:03 +01:00
|
|
|
flushDeadlineMs atomic.Int64
|
2024-02-23 18:53:55 +01:00
|
|
|
|
2024-02-23 21:27:03 +01:00
|
|
|
shardIdx atomic.Uint32
|
2024-02-22 23:02:22 +01:00
|
|
|
|
2019-12-19 17:12:02 +01:00
|
|
|
// Shards reduce lock contention when adding rows on multi-CPU systems.
|
|
|
|
shards []rawRowsShard
|
2024-02-22 23:02:22 +01:00
|
|
|
|
|
|
|
rowssToFlushLock sync.Mutex
|
|
|
|
rowssToFlush [][]rawRow
|
2019-12-19 17:12:02 +01:00
|
|
|
}
|
|
|
|
|
2021-04-27 14:36:31 +02:00
|
|
|
func (rrss *rawRowsShards) init() {
|
|
|
|
rrss.shards = make([]rawRowsShard, rawRowsShardsPerPartition)
|
2019-12-19 17:12:02 +01:00
|
|
|
}
|
|
|
|
|
2021-04-27 14:36:31 +02:00
|
|
|
func (rrss *rawRowsShards) addRows(pt *partition, rows []rawRow) {
|
|
|
|
shards := rrss.shards
|
2022-12-06 00:15:00 +01:00
|
|
|
shardsLen := uint32(len(shards))
|
|
|
|
for len(rows) > 0 {
|
2024-02-23 21:27:03 +01:00
|
|
|
n := rrss.shardIdx.Add(1)
|
2022-12-06 00:15:00 +01:00
|
|
|
idx := n % shardsLen
|
2024-02-22 23:02:22 +01:00
|
|
|
tailRows, rowsToFlush := shards[idx].addRows(rows)
|
|
|
|
rrss.addRowsToFlush(pt, rowsToFlush)
|
|
|
|
rows = tailRows
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (rrss *rawRowsShards) addRowsToFlush(pt *partition, rowsToFlush []rawRow) {
|
|
|
|
if len(rowsToFlush) == 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
var rowssToMerge [][]rawRow
|
|
|
|
|
|
|
|
rrss.rowssToFlushLock.Lock()
|
|
|
|
if len(rrss.rowssToFlush) == 0 {
|
|
|
|
rrss.updateFlushDeadline()
|
|
|
|
}
|
|
|
|
rrss.rowssToFlush = append(rrss.rowssToFlush, rowsToFlush)
|
|
|
|
if len(rrss.rowssToFlush) >= defaultPartsToMerge {
|
|
|
|
rowssToMerge = rrss.rowssToFlush
|
|
|
|
rrss.rowssToFlush = nil
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
2024-02-22 23:02:22 +01:00
|
|
|
rrss.rowssToFlushLock.Unlock()
|
|
|
|
|
|
|
|
pt.flushRowssToInmemoryParts(rowssToMerge)
|
2019-12-19 17:12:02 +01:00
|
|
|
}
|
|
|
|
|
2021-04-27 14:36:31 +02:00
|
|
|
func (rrss *rawRowsShards) Len() int {
|
2019-12-19 17:12:02 +01:00
|
|
|
n := 0
|
2021-04-27 14:36:31 +02:00
|
|
|
for i := range rrss.shards[:] {
|
|
|
|
n += rrss.shards[i].Len()
|
2019-12-19 17:12:02 +01:00
|
|
|
}
|
2024-02-22 23:02:22 +01:00
|
|
|
|
|
|
|
rrss.rowssToFlushLock.Lock()
|
|
|
|
for _, rows := range rrss.rowssToFlush {
|
|
|
|
n += len(rows)
|
|
|
|
}
|
|
|
|
rrss.rowssToFlushLock.Unlock()
|
|
|
|
|
2019-12-19 17:12:02 +01:00
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
2024-02-22 23:02:22 +01:00
|
|
|
func (rrss *rawRowsShards) updateFlushDeadline() {
|
2024-02-23 21:27:03 +01:00
|
|
|
rrss.flushDeadlineMs.Store(time.Now().Add(pendingRowsFlushInterval).UnixMilli())
|
2024-02-22 23:02:22 +01:00
|
|
|
}
|
|
|
|
|
2022-10-20 15:17:09 +02:00
|
|
|
type rawRowsShardNopad struct {
|
2024-02-23 21:27:03 +01:00
|
|
|
flushDeadlineMs atomic.Int64
|
2022-10-20 15:17:09 +02:00
|
|
|
|
|
|
|
mu sync.Mutex
|
|
|
|
rows []rawRow
|
|
|
|
}
|
|
|
|
|
|
|
|
type rawRowsShard struct {
|
|
|
|
rawRowsShardNopad
|
|
|
|
|
|
|
|
// The padding prevents false sharing on widespread platforms with
|
|
|
|
// 128 mod (cache line size) = 0 .
|
|
|
|
_ [128 - unsafe.Sizeof(rawRowsShardNopad{})%128]byte
|
2019-12-19 17:12:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
func (rrs *rawRowsShard) Len() int {
|
2021-04-27 14:36:31 +02:00
|
|
|
rrs.mu.Lock()
|
2019-12-19 17:12:02 +01:00
|
|
|
n := len(rrs.rows)
|
2021-04-27 14:36:31 +02:00
|
|
|
rrs.mu.Unlock()
|
2019-12-19 17:12:02 +01:00
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
2024-02-22 23:02:22 +01:00
|
|
|
func (rrs *rawRowsShard) addRows(rows []rawRow) ([]rawRow, []rawRow) {
|
2024-02-22 16:37:43 +01:00
|
|
|
var rowsToFlush []rawRow
|
2019-12-19 17:12:02 +01:00
|
|
|
|
2021-04-27 14:36:31 +02:00
|
|
|
rrs.mu.Lock()
|
2019-12-19 17:12:02 +01:00
|
|
|
if cap(rrs.rows) == 0 {
|
2023-01-18 09:01:03 +01:00
|
|
|
rrs.rows = newRawRows()
|
2019-12-19 17:12:02 +01:00
|
|
|
}
|
2024-02-22 23:02:22 +01:00
|
|
|
if len(rrs.rows) == 0 {
|
|
|
|
rrs.updateFlushDeadline()
|
|
|
|
}
|
2022-10-21 13:39:27 +02:00
|
|
|
n := copy(rrs.rows[len(rrs.rows):cap(rrs.rows)], rows)
|
|
|
|
rrs.rows = rrs.rows[:len(rrs.rows)+n]
|
|
|
|
rows = rows[n:]
|
|
|
|
if len(rows) > 0 {
|
2024-02-22 16:37:43 +01:00
|
|
|
rowsToFlush = rrs.rows
|
|
|
|
rrs.rows = newRawRows()
|
2024-02-22 23:02:22 +01:00
|
|
|
rrs.updateFlushDeadline()
|
2022-12-06 00:15:00 +01:00
|
|
|
n = copy(rrs.rows[:cap(rrs.rows)], rows)
|
|
|
|
rrs.rows = rrs.rows[:n]
|
|
|
|
rows = rows[n:]
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2021-04-27 14:36:31 +02:00
|
|
|
rrs.mu.Unlock()
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2024-02-22 23:02:22 +01:00
|
|
|
return rows, rowsToFlush
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
|
|
|
|
2023-01-18 09:01:03 +01:00
|
|
|
func newRawRows() []rawRow {
|
2024-02-22 19:06:37 +01:00
|
|
|
return make([]rawRow, 0, maxRawRowsPerShard)
|
2022-10-21 13:46:06 +02:00
|
|
|
}
|
|
|
|
|
2024-02-22 23:02:22 +01:00
|
|
|
func (pt *partition) flushRowssToInmemoryParts(rowss [][]rawRow) {
|
|
|
|
if len(rowss) == 0 {
|
2024-02-22 19:44:11 +01:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2024-02-22 23:02:22 +01:00
|
|
|
// Convert rowss into in-memory parts.
|
2022-12-06 00:15:00 +01:00
|
|
|
var pwsLock sync.Mutex
|
2024-02-22 23:02:22 +01:00
|
|
|
pws := make([]*partWrapper, 0, len(rowss))
|
2022-04-06 12:34:00 +02:00
|
|
|
wg := getWaitGroup()
|
2024-02-22 23:02:22 +01:00
|
|
|
for _, rows := range rowss {
|
2021-06-17 12:42:32 +02:00
|
|
|
wg.Add(1)
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
inmemoryPartsConcurrencyCh <- struct{}{}
|
2022-12-06 00:15:00 +01:00
|
|
|
go func(rowsChunk []rawRow) {
|
|
|
|
defer func() {
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
<-inmemoryPartsConcurrencyCh
|
2022-12-06 00:15:00 +01:00
|
|
|
wg.Done()
|
|
|
|
}()
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
pw := pt.createInmemoryPart(rowsChunk)
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
if pw != nil {
|
|
|
|
pwsLock.Lock()
|
|
|
|
pws = append(pws, pw)
|
|
|
|
pwsLock.Unlock()
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
2024-02-22 23:02:22 +01:00
|
|
|
}(rows)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2021-06-17 12:42:32 +02:00
|
|
|
wg.Wait()
|
2022-04-06 12:34:00 +02:00
|
|
|
putWaitGroup(wg)
|
2022-12-06 00:15:00 +01:00
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// Merge pws into a single in-memory part.
|
|
|
|
maxPartSize := getMaxInmemoryPartSize()
|
|
|
|
for len(pws) > 1 {
|
|
|
|
pws = pt.mustMergeInmemoryParts(pws)
|
|
|
|
|
|
|
|
pwsRemaining := pws[:0]
|
|
|
|
for _, pw := range pws {
|
|
|
|
if pw.p.size >= maxPartSize {
|
|
|
|
pt.addToInmemoryParts(pw)
|
|
|
|
} else {
|
|
|
|
pwsRemaining = append(pwsRemaining, pw)
|
|
|
|
}
|
2023-01-18 10:09:03 +01:00
|
|
|
}
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
pws = pwsRemaining
|
|
|
|
}
|
|
|
|
if len(pws) == 1 {
|
|
|
|
pt.addToInmemoryParts(pws[0])
|
2023-01-18 10:09:03 +01:00
|
|
|
}
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) addToInmemoryParts(pw *partWrapper) {
|
|
|
|
pt.partsLock.Lock()
|
|
|
|
pt.inmemoryParts = append(pt.inmemoryParts, pw)
|
|
|
|
pt.startInmemoryPartsMergerLocked()
|
2022-12-06 00:15:00 +01:00
|
|
|
pt.partsLock.Unlock()
|
|
|
|
}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
func (pt *partition) NotifyReadWriteMode() {
|
|
|
|
pt.startInmemoryPartsMergers()
|
|
|
|
pt.startSmallPartsMergers()
|
|
|
|
pt.startBigPartsMergers()
|
2023-01-18 10:09:03 +01:00
|
|
|
}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers for every part type elegantly resolves all of these issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent a possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
func (pt *partition) inmemoryPartsMerger() {
|
|
|
|
for {
|
2024-02-23 22:29:23 +01:00
|
|
|
if pt.s.isReadOnly.Load() {
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
return
|
|
|
|
}
|
|
|
|
maxOutBytes := pt.getMaxBigPartSize()
|
2023-02-11 21:06:18 +01:00
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
pt.partsLock.Lock()
|
|
|
|
pws := getPartsToMerge(pt.inmemoryParts, maxOutBytes)
|
|
|
|
pt.partsLock.Unlock()
|
2022-12-06 00:15:00 +01:00
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
if len(pws) == 0 {
|
|
|
|
// Nothing to merge
|
|
|
|
return
|
|
|
|
}
|
2022-12-28 23:32:18 +01:00
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
inmemoryPartsConcurrencyCh <- struct{}{}
|
|
|
|
err := pt.mergeParts(pws, pt.stopCh, false)
|
|
|
|
<-inmemoryPartsConcurrencyCh
|
2022-12-06 00:15:00 +01:00
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
if err == nil {
|
|
|
|
// Try merging additional parts.
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if errors.Is(err, errForciblyStopped) {
|
|
|
|
// Nothing to do - finish the merger.
|
|
|
|
return
|
|
|
|
}
|
|
|
|
// Unexpected error.
|
|
|
|
logger.Panicf("FATAL: unrecoverable error when merging inmemory parts in partition %q: %s", pt.name, err)
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers for every part type elegantly resolves all of these issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent a possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
func (pt *partition) smallPartsMerger() {
|
|
|
|
for {
|
2024-02-23 22:29:23 +01:00
|
|
|
if pt.s.isReadOnly.Load() {
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
return
|
|
|
|
}
|
|
|
|
maxOutBytes := pt.getMaxBigPartSize()
|
2022-12-13 01:49:21 +01:00
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
pt.partsLock.Lock()
|
|
|
|
pws := getPartsToMerge(pt.smallParts, maxOutBytes)
|
|
|
|
pt.partsLock.Unlock()
|
|
|
|
|
|
|
|
if len(pws) == 0 {
|
|
|
|
// Nothing to merge
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
smallPartsConcurrencyCh <- struct{}{}
|
|
|
|
err := pt.mergeParts(pws, pt.stopCh, false)
|
|
|
|
<-smallPartsConcurrencyCh
|
|
|
|
|
|
|
|
if err == nil {
|
|
|
|
// Try merging additional parts.
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if errors.Is(err, errForciblyStopped) {
|
|
|
|
// Nothing to do - finish the merger.
|
|
|
|
return
|
|
|
|
}
|
|
|
|
// Unexpected error.
|
|
|
|
logger.Panicf("FATAL: unrecoverable error when merging small parts at %q: %s", pt.smallPartsPath, err)
|
2022-12-13 01:49:21 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
func (pt *partition) bigPartsMerger() {
|
|
|
|
for {
|
2024-02-23 22:29:23 +01:00
|
|
|
if pt.s.isReadOnly.Load() {
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
return
|
|
|
|
}
|
|
|
|
maxOutBytes := pt.getMaxBigPartSize()
|
|
|
|
|
|
|
|
pt.partsLock.Lock()
|
|
|
|
pws := getPartsToMerge(pt.bigParts, maxOutBytes)
|
|
|
|
pt.partsLock.Unlock()
|
|
|
|
|
|
|
|
if len(pws) == 0 {
|
|
|
|
// Nothing to merge
|
|
|
|
return
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
|
|
|
|
bigPartsConcurrencyCh <- struct{}{}
|
|
|
|
err := pt.mergeParts(pws, pt.stopCh, false)
|
|
|
|
<-bigPartsConcurrencyCh
|
|
|
|
|
|
|
|
if err == nil {
|
|
|
|
// Try merging additional parts.
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if errors.Is(err, errForciblyStopped) {
|
|
|
|
// Nothing to do - finish the merger.
|
|
|
|
return
|
|
|
|
}
|
|
|
|
// Unexpected error.
|
|
|
|
logger.Panicf("FATAL: unrecoverable error when merging big parts at %q: %s", pt.bigPartsPath, err)
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2022-04-06 12:34:00 +02:00
|
|
|
// wgPool keeps the *sync.WaitGroup values handed back via putWaitGroup,
// so getWaitGroup can reuse them.
var wgPool sync.Pool

// getWaitGroup returns a WaitGroup taken from wgPool.
//
// A new WaitGroup is allocated when the pool has nothing to offer.
// Pass the WaitGroup to putWaitGroup when it is no longer needed.
func getWaitGroup() *sync.WaitGroup {
	if pooled := wgPool.Get(); pooled != nil {
		return pooled.(*sync.WaitGroup)
	}
	return new(sync.WaitGroup)
}

// putWaitGroup stores wg in wgPool for later reuse by getWaitGroup.
func putWaitGroup(wg *sync.WaitGroup) {
	wgPool.Put(wg)
}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
func (pt *partition) mustMergeInmemoryParts(pws []*partWrapper) []*partWrapper {
|
|
|
|
var pwsResult []*partWrapper
|
|
|
|
var pwsResultLock sync.Mutex
|
|
|
|
wg := getWaitGroup()
|
|
|
|
for len(pws) > 0 {
|
|
|
|
pwsToMerge, pwsRemaining := getPartsForOptimalMerge(pws)
|
|
|
|
wg.Add(1)
|
|
|
|
inmemoryPartsConcurrencyCh <- struct{}{}
|
|
|
|
go func(pwsChunk []*partWrapper) {
|
|
|
|
defer func() {
|
|
|
|
<-inmemoryPartsConcurrencyCh
|
|
|
|
wg.Done()
|
|
|
|
}()
|
|
|
|
|
|
|
|
pw := pt.mustMergeInmemoryPartsFinal(pwsChunk)
|
|
|
|
|
|
|
|
pwsResultLock.Lock()
|
|
|
|
pwsResult = append(pwsResult, pw)
|
|
|
|
pwsResultLock.Unlock()
|
|
|
|
}(pwsToMerge)
|
|
|
|
pws = pwsRemaining
|
|
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
putWaitGroup(wg)
|
|
|
|
|
|
|
|
return pwsResult
|
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) mustMergeInmemoryPartsFinal(pws []*partWrapper) *partWrapper {
|
|
|
|
if len(pws) == 0 {
|
|
|
|
logger.Panicf("BUG: pws must contain at least a single item")
|
|
|
|
}
|
|
|
|
if len(pws) == 1 {
|
|
|
|
// Nothing to merge
|
|
|
|
return pws[0]
|
|
|
|
}
|
|
|
|
|
|
|
|
bsrs := make([]*blockStreamReader, 0, len(pws))
|
|
|
|
for _, pw := range pws {
|
|
|
|
if pw.mp == nil {
|
|
|
|
logger.Panicf("BUG: unexpected file part")
|
|
|
|
}
|
|
|
|
bsr := getBlockStreamReader()
|
|
|
|
bsr.MustInitFromInmemoryPart(pw.mp)
|
|
|
|
bsrs = append(bsrs, bsr)
|
|
|
|
}
|
|
|
|
|
|
|
|
// determine flushToDiskDeadline before performing the actual merge,
|
|
|
|
// in order to guarantee the correct deadline, since the merge may take significant amounts of time.
|
|
|
|
flushToDiskDeadline := getFlushToDiskDeadline(pws)
|
|
|
|
|
|
|
|
// Prepare blockStreamWriter for destination part.
|
|
|
|
srcRowsCount := uint64(0)
|
|
|
|
srcBlocksCount := uint64(0)
|
|
|
|
for _, bsr := range bsrs {
|
|
|
|
srcRowsCount += bsr.ph.RowsCount
|
|
|
|
srcBlocksCount += bsr.ph.BlocksCount
|
|
|
|
}
|
|
|
|
rowsPerBlock := float64(srcRowsCount) / float64(srcBlocksCount)
|
|
|
|
compressLevel := getCompressLevel(rowsPerBlock)
|
|
|
|
bsw := getBlockStreamWriter()
|
|
|
|
mpDst := getInmemoryPart()
|
|
|
|
bsw.MustInitFromInmemoryPart(mpDst, compressLevel)
|
|
|
|
|
|
|
|
// Merge parts.
|
|
|
|
// The merge shouldn't be interrupted by stopCh, so use nil stopCh.
|
|
|
|
ph, err := pt.mergePartsInternal("", bsw, bsrs, partInmemory, nil)
|
|
|
|
putBlockStreamWriter(bsw)
|
|
|
|
for _, bsr := range bsrs {
|
|
|
|
putBlockStreamReader(bsr)
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
logger.Panicf("FATAL: cannot merge inmemoryBlocks: %s", err)
|
|
|
|
}
|
|
|
|
mpDst.ph = *ph
|
|
|
|
|
|
|
|
return newPartWrapperFromInmemoryPart(mpDst, flushToDiskDeadline)
|
|
|
|
}
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
func (pt *partition) createInmemoryPart(rows []rawRow) *partWrapper {
|
2019-05-22 23:16:55 +02:00
|
|
|
if len(rows) == 0 {
|
2022-12-06 00:15:00 +01:00
|
|
|
return nil
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
mp := getInmemoryPart()
|
|
|
|
mp.InitFromRows(rows)
|
|
|
|
|
|
|
|
// Make sure the part may be added.
|
|
|
|
if mp.ph.MinTimestamp > mp.ph.MaxTimestamp {
|
|
|
|
logger.Panicf("BUG: the part %q cannot be added to partition %q because its MinTimestamp exceeds MaxTimestamp; %d vs %d",
|
|
|
|
&mp.ph, pt.smallPartsPath, mp.ph.MinTimestamp, mp.ph.MaxTimestamp)
|
|
|
|
}
|
|
|
|
if mp.ph.MinTimestamp < pt.tr.MinTimestamp {
|
|
|
|
logger.Panicf("BUG: the part %q cannot be added to partition %q because of too small MinTimestamp; got %d; want at least %d",
|
|
|
|
&mp.ph, pt.smallPartsPath, mp.ph.MinTimestamp, pt.tr.MinTimestamp)
|
|
|
|
}
|
|
|
|
if mp.ph.MaxTimestamp > pt.tr.MaxTimestamp {
|
|
|
|
logger.Panicf("BUG: the part %q cannot be added to partition %q because of too big MaxTimestamp; got %d; want at least %d",
|
|
|
|
&mp.ph, pt.smallPartsPath, mp.ph.MaxTimestamp, pt.tr.MaxTimestamp)
|
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
flushToDiskDeadline := time.Now().Add(dataFlushInterval)
|
|
|
|
return newPartWrapperFromInmemoryPart(mp, flushToDiskDeadline)
|
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
func newPartWrapperFromInmemoryPart(mp *inmemoryPart, flushToDiskDeadline time.Time) *partWrapper {
|
2023-04-15 00:46:09 +02:00
|
|
|
p := mp.NewPart()
|
2019-05-22 23:16:55 +02:00
|
|
|
pw := &partWrapper{
|
2022-12-06 00:15:00 +01:00
|
|
|
p: p,
|
|
|
|
mp: mp,
|
|
|
|
flushToDiskDeadline: flushToDiskDeadline,
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2024-02-23 21:54:55 +01:00
|
|
|
pw.incRef()
|
2022-12-06 00:15:00 +01:00
|
|
|
return pw
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// HasTimestamp returns true if the pt contains the given timestamp.
|
|
|
|
func (pt *partition) HasTimestamp(timestamp int64) bool {
|
|
|
|
return timestamp >= pt.tr.MinTimestamp && timestamp <= pt.tr.MaxTimestamp
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetParts appends parts snapshot to dst and returns it.
|
|
|
|
//
|
|
|
|
// The appended parts must be released with PutParts.
|
2023-02-01 18:54:21 +01:00
|
|
|
func (pt *partition) GetParts(dst []*partWrapper, addInMemory bool) []*partWrapper {
|
2019-05-22 23:16:55 +02:00
|
|
|
pt.partsLock.Lock()
|
2023-02-01 18:54:21 +01:00
|
|
|
if addInMemory {
|
2023-03-19 09:36:05 +01:00
|
|
|
incRefForParts(pt.inmemoryParts)
|
2023-02-01 18:54:21 +01:00
|
|
|
dst = append(dst, pt.inmemoryParts...)
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
2023-03-19 09:36:05 +01:00
|
|
|
incRefForParts(pt.smallParts)
|
2019-05-22 23:16:55 +02:00
|
|
|
dst = append(dst, pt.smallParts...)
|
2023-03-19 09:36:05 +01:00
|
|
|
incRefForParts(pt.bigParts)
|
2019-05-22 23:16:55 +02:00
|
|
|
dst = append(dst, pt.bigParts...)
|
|
|
|
pt.partsLock.Unlock()
|
|
|
|
|
|
|
|
return dst
|
|
|
|
}
|
|
|
|
|
|
|
|
// PutParts releases the given pws obtained via GetParts.
|
|
|
|
func (pt *partition) PutParts(pws []*partWrapper) {
|
|
|
|
for _, pw := range pws {
|
|
|
|
pw.decRef()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
func incRefForParts(pws []*partWrapper) {
|
|
|
|
for _, pw := range pws {
|
|
|
|
pw.incRef()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-05-22 23:16:55 +02:00
|
|
|
// MustClose closes the pt, so the app may safely exit.
|
|
|
|
//
|
|
|
|
// The pt must be detached from table before calling pt.MustClose.
|
|
|
|
func (pt *partition) MustClose() {
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// Notify the background workers to stop.
|
|
|
|
// The pt.partsLock is acquired in order to guarantee that pt.wg.Add() isn't called
|
|
|
|
// after pt.stopCh is closed and pt.wg.Wait() is called below.
|
|
|
|
pt.partsLock.Lock()
|
2019-05-22 23:16:55 +02:00
|
|
|
close(pt.stopCh)
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
pt.partsLock.Unlock()
|
2019-05-22 23:16:55 +02:00
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// Wait for background workers to stop.
|
2022-12-04 08:03:05 +01:00
|
|
|
pt.wg.Wait()
|
2019-05-22 23:16:55 +02:00
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// Flush the remaining in-memory rows to files.
|
|
|
|
pt.flushInmemoryRowsToFiles()
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
// Remove references from inmemoryParts, smallParts and bigParts, so they may be eventually closed
|
2019-05-25 20:51:11 +02:00
|
|
|
// after all the searches are done.
|
2019-05-22 23:16:55 +02:00
|
|
|
pt.partsLock.Lock()
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
|
|
|
|
if n := pt.rawRows.Len(); n > 0 {
|
|
|
|
logger.Panicf("BUG: raw rows must be empty at this stage; got %d rows", n)
|
|
|
|
}
|
|
|
|
|
|
|
|
if n := len(pt.inmemoryParts); n > 0 {
|
|
|
|
logger.Panicf("BUG: in-memory parts must be empty at this stage; got %d parts", n)
|
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
pt.inmemoryParts = nil
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
|
|
|
|
smallParts := pt.smallParts
|
2019-05-22 23:16:55 +02:00
|
|
|
pt.smallParts = nil
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
|
|
|
|
bigParts := pt.bigParts
|
2022-12-06 00:15:00 +01:00
|
|
|
pt.bigParts = nil
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
|
2019-05-22 23:16:55 +02:00
|
|
|
pt.partsLock.Unlock()
|
|
|
|
|
|
|
|
for _, pw := range smallParts {
|
|
|
|
pw.decRef()
|
|
|
|
}
|
|
|
|
for _, pw := range bigParts {
|
|
|
|
pw.decRef()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
func (pt *partition) startInmemoryPartsMergers() {
|
|
|
|
pt.partsLock.Lock()
|
|
|
|
for i := 0; i < cap(inmemoryPartsConcurrencyCh); i++ {
|
|
|
|
pt.startInmemoryPartsMergerLocked()
|
|
|
|
}
|
|
|
|
pt.partsLock.Unlock()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) startInmemoryPartsMergerLocked() {
|
|
|
|
select {
|
|
|
|
case <-pt.stopCh:
|
|
|
|
return
|
|
|
|
default:
|
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
pt.wg.Add(1)
|
|
|
|
go func() {
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
pt.inmemoryPartsMerger()
|
|
|
|
pt.wg.Done()
|
|
|
|
}()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) startSmallPartsMergers() {
|
|
|
|
pt.partsLock.Lock()
|
|
|
|
for i := 0; i < cap(smallPartsConcurrencyCh); i++ {
|
|
|
|
pt.startSmallPartsMergerLocked()
|
|
|
|
}
|
|
|
|
pt.partsLock.Unlock()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) startSmallPartsMergerLocked() {
|
|
|
|
select {
|
|
|
|
case <-pt.stopCh:
|
|
|
|
return
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
pt.wg.Add(1)
|
|
|
|
go func() {
|
|
|
|
pt.smallPartsMerger()
|
|
|
|
pt.wg.Done()
|
|
|
|
}()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) startBigPartsMergers() {
|
|
|
|
pt.partsLock.Lock()
|
|
|
|
for i := 0; i < cap(bigPartsConcurrencyCh); i++ {
|
|
|
|
pt.startBigPartsMergerLocked()
|
|
|
|
}
|
|
|
|
pt.partsLock.Unlock()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) startBigPartsMergerLocked() {
|
|
|
|
select {
|
|
|
|
case <-pt.stopCh:
|
|
|
|
return
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
pt.wg.Add(1)
|
|
|
|
go func() {
|
|
|
|
pt.bigPartsMerger()
|
2022-12-06 00:15:00 +01:00
|
|
|
pt.wg.Done()
|
|
|
|
}()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) startPendingRowsFlusher() {
|
2022-12-04 08:03:05 +01:00
|
|
|
pt.wg.Add(1)
|
2019-05-22 23:16:55 +02:00
|
|
|
go func() {
|
2022-12-06 00:15:00 +01:00
|
|
|
pt.pendingRowsFlusher()
|
2022-12-04 08:03:05 +01:00
|
|
|
pt.wg.Done()
|
2019-05-22 23:16:55 +02:00
|
|
|
}()
|
|
|
|
}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
func (pt *partition) startInmemoryPartsFlusher() {
|
|
|
|
pt.wg.Add(1)
|
|
|
|
go func() {
|
|
|
|
pt.inmemoryPartsFlusher()
|
|
|
|
pt.wg.Done()
|
|
|
|
}()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) startStalePartsRemover() {
|
|
|
|
pt.wg.Add(1)
|
|
|
|
go func() {
|
|
|
|
pt.stalePartsRemover()
|
|
|
|
pt.wg.Done()
|
|
|
|
}()
|
|
|
|
}
|
|
|
|
|
|
|
|
var (
|
|
|
|
inmemoryPartsConcurrencyCh = make(chan struct{}, getInmemoryPartsConcurrency())
|
|
|
|
smallPartsConcurrencyCh = make(chan struct{}, getSmallPartsConcurrency())
|
|
|
|
bigPartsConcurrencyCh = make(chan struct{}, getBigPartsConcurrency())
|
|
|
|
)
|
|
|
|
|
|
|
|
func getInmemoryPartsConcurrency() int {
|
|
|
|
// The concurrency for processing in-memory parts must equal to the number of CPU cores,
|
|
|
|
// since these operations are CPU-bound.
|
|
|
|
return cgroup.AvailableCPUs()
|
|
|
|
}
|
|
|
|
|
|
|
|
func getSmallPartsConcurrency() int {
|
|
|
|
n := cgroup.AvailableCPUs()
|
|
|
|
if n < 4 {
|
|
|
|
// Allow at least 4 concurrent workers for small parts on systems
|
|
|
|
// with less than 4 CPU cores in order to be able to make smaller part merges
|
|
|
|
// when bigger part merges are in progress.
|
|
|
|
return 4
|
|
|
|
}
|
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
|
|
|
func getBigPartsConcurrency() int {
|
|
|
|
n := cgroup.AvailableCPUs()
|
|
|
|
if n < 4 {
|
|
|
|
// Allow at least 4 concurrent workers for big parts on systems
|
|
|
|
// with less than 4 CPU cores in order to be able to make smaller part merges
|
|
|
|
// when bigger part merges are in progress.
|
|
|
|
return 4
|
|
|
|
}
|
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
func (pt *partition) inmemoryPartsFlusher() {
|
2024-02-22 19:06:37 +01:00
|
|
|
// Do not add jitter to d in order to guarantee the flush interval
|
|
|
|
d := dataFlushInterval
|
2024-01-22 17:12:37 +01:00
|
|
|
ticker := time.NewTicker(d)
|
2022-12-06 00:15:00 +01:00
|
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-pt.stopCh:
|
|
|
|
return
|
|
|
|
case <-ticker.C:
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
pt.flushInmemoryPartsToFiles(false)
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) pendingRowsFlusher() {
|
2024-02-22 19:06:37 +01:00
|
|
|
// Do not add jitter to d in order to guarantee the flush interval
|
|
|
|
d := pendingRowsFlushInterval
|
2024-01-22 17:12:37 +01:00
|
|
|
ticker := time.NewTicker(d)
|
2020-02-13 11:55:58 +01:00
|
|
|
defer ticker.Stop()
|
2019-05-22 23:16:55 +02:00
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-pt.stopCh:
|
|
|
|
return
|
2020-02-13 11:55:58 +01:00
|
|
|
case <-ticker.C:
|
2024-02-22 16:22:23 +01:00
|
|
|
pt.flushPendingRows(false)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2019-12-19 17:12:02 +01:00
|
|
|
}
|
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2024-02-22 16:22:23 +01:00
|
|
|
// flushPendingRows flushes pt.rawRows by delegating to pt.rawRows.flush.
//
// isFinal is passed through unchanged; when it is set, rows are flushed
// regardless of their flush deadlines (see rawRowsShards.flush).
func (pt *partition) flushPendingRows(isFinal bool) {
	pt.rawRows.flush(pt, isFinal)
}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// flushInmemoryRowsToFiles unconditionally flushes pending rows and then in-memory parts.
//
// Both steps are invoked with isFinal=true, so flush deadlines are ignored.
func (pt *partition) flushInmemoryRowsToFiles() {
	// Pending rows are flushed first, before the in-memory parts are flushed to files.
	pt.flushPendingRows(true)
	pt.flushInmemoryPartsToFiles(true)
}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
func (pt *partition) flushInmemoryPartsToFiles(isFinal bool) {
|
2023-03-19 08:10:24 +01:00
|
|
|
currentTime := time.Now()
|
|
|
|
var pws []*partWrapper
|
2022-12-06 00:15:00 +01:00
|
|
|
|
2023-03-19 08:10:24 +01:00
|
|
|
pt.partsLock.Lock()
|
|
|
|
for _, pw := range pt.inmemoryParts {
|
|
|
|
if !pw.isInMerge && (isFinal || pw.flushToDiskDeadline.Before(currentTime)) {
|
|
|
|
pw.isInMerge = true
|
|
|
|
pws = append(pws, pw)
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
2023-03-19 08:10:24 +01:00
|
|
|
}
|
|
|
|
pt.partsLock.Unlock()
|
2022-12-06 00:15:00 +01:00
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
if err := pt.mergePartsToFiles(pws, nil, inmemoryPartsConcurrencyCh); err != nil {
|
2023-03-19 08:10:24 +01:00
|
|
|
logger.Panicf("FATAL: cannot merge in-memory parts: %s", err)
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
2019-12-19 17:12:02 +01:00
|
|
|
}
|
|
|
|
|
2024-02-22 16:22:23 +01:00
|
|
|
func (rrss *rawRowsShards) flush(pt *partition, isFinal bool) {
|
2024-02-22 23:02:22 +01:00
|
|
|
var dst [][]rawRow
|
|
|
|
|
|
|
|
currentTimeMs := time.Now().UnixMilli()
|
2024-02-23 21:27:03 +01:00
|
|
|
flushDeadlineMs := rrss.flushDeadlineMs.Load()
|
2024-02-22 23:02:22 +01:00
|
|
|
if isFinal || currentTimeMs >= flushDeadlineMs {
|
|
|
|
rrss.rowssToFlushLock.Lock()
|
|
|
|
dst = rrss.rowssToFlush
|
|
|
|
rrss.rowssToFlush = nil
|
|
|
|
rrss.rowssToFlushLock.Unlock()
|
|
|
|
}
|
|
|
|
|
2021-04-27 14:36:31 +02:00
|
|
|
for i := range rrss.shards {
|
2024-02-22 23:02:22 +01:00
|
|
|
dst = rrss.shards[i].appendRawRowsToFlush(dst, currentTimeMs, isFinal)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2024-02-22 23:02:22 +01:00
|
|
|
|
|
|
|
pt.flushRowssToInmemoryParts(dst)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2024-02-22 23:02:22 +01:00
|
|
|
func (rrs *rawRowsShard) appendRawRowsToFlush(dst [][]rawRow, currentTimeMs int64, isFinal bool) [][]rawRow {
|
2024-02-23 21:27:03 +01:00
|
|
|
flushDeadlineMs := rrs.flushDeadlineMs.Load()
|
2024-02-22 23:02:22 +01:00
|
|
|
if !isFinal && currentTimeMs < flushDeadlineMs {
|
2022-10-21 13:33:03 +02:00
|
|
|
// Fast path - nothing to flush
|
|
|
|
return dst
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2024-02-22 23:02:22 +01:00
|
|
|
|
2022-10-21 13:33:03 +02:00
|
|
|
// Slow path - move rrs.rows to dst.
|
|
|
|
rrs.mu.Lock()
|
2024-02-22 23:02:22 +01:00
|
|
|
dst = appendRawRowss(dst, rrs.rows)
|
2022-10-21 13:33:03 +02:00
|
|
|
rrs.rows = rrs.rows[:0]
|
|
|
|
rrs.mu.Unlock()
|
2024-02-22 23:02:22 +01:00
|
|
|
|
|
|
|
return dst
|
|
|
|
}
|
|
|
|
|
|
|
|
func (rrs *rawRowsShard) updateFlushDeadline() {
|
2024-02-23 21:27:03 +01:00
|
|
|
rrs.flushDeadlineMs.Store(time.Now().Add(pendingRowsFlushInterval).UnixMilli())
|
2024-02-22 23:02:22 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
func appendRawRowss(dst [][]rawRow, src []rawRow) [][]rawRow {
|
|
|
|
if len(src) == 0 {
|
|
|
|
return dst
|
|
|
|
}
|
|
|
|
if len(dst) == 0 {
|
|
|
|
dst = append(dst, newRawRows())
|
|
|
|
}
|
|
|
|
prows := &dst[len(dst)-1]
|
|
|
|
n := copy((*prows)[len(*prows):cap(*prows)], src)
|
|
|
|
*prows = (*prows)[:len(*prows)+n]
|
|
|
|
src = src[n:]
|
|
|
|
for len(src) > 0 {
|
|
|
|
rows := newRawRows()
|
|
|
|
n := copy(rows[:cap(rows)], src)
|
|
|
|
rows = rows[:len(rows)+n]
|
|
|
|
src = src[n:]
|
|
|
|
dst = append(dst, rows)
|
|
|
|
}
|
2021-06-17 12:42:32 +02:00
|
|
|
return dst
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
func (pt *partition) mergePartsToFiles(pws []*partWrapper, stopCh <-chan struct{}, concurrencyCh chan struct{}) error {
|
|
|
|
pwsLen := len(pws)
|
|
|
|
|
|
|
|
var errGlobal error
|
|
|
|
var errGlobalLock sync.Mutex
|
|
|
|
wg := getWaitGroup()
|
2022-12-06 00:15:00 +01:00
|
|
|
for len(pws) > 0 {
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
pwsToMerge, pwsRemaining := getPartsForOptimalMerge(pws)
|
|
|
|
wg.Add(1)
|
|
|
|
concurrencyCh <- struct{}{}
|
|
|
|
go func(pwsChunk []*partWrapper) {
|
|
|
|
defer func() {
|
|
|
|
<-concurrencyCh
|
|
|
|
wg.Done()
|
|
|
|
}()
|
|
|
|
|
|
|
|
if err := pt.mergeParts(pwsChunk, stopCh, true); err != nil && !errors.Is(err, errForciblyStopped) {
|
|
|
|
errGlobalLock.Lock()
|
|
|
|
if errGlobal == nil {
|
|
|
|
errGlobal = err
|
|
|
|
}
|
|
|
|
errGlobalLock.Unlock()
|
|
|
|
}
|
|
|
|
}(pwsToMerge)
|
|
|
|
pws = pwsRemaining
|
|
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
putWaitGroup(wg)
|
|
|
|
|
|
|
|
if errGlobal != nil {
|
|
|
|
return fmt.Errorf("cannot merge %d parts optimally: %w", pwsLen, errGlobal)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
// ForceMergeAllParts runs merge for all the parts in pt.
|
2024-04-02 20:24:57 +02:00
|
|
|
func (pt *partition) ForceMergeAllParts(stopCh <-chan struct{}) error {
|
2022-12-06 00:15:00 +01:00
|
|
|
pws := pt.getAllPartsForMerge()
|
2020-09-17 11:01:53 +02:00
|
|
|
if len(pws) == 0 {
|
|
|
|
// Nothing to merge.
|
|
|
|
return nil
|
|
|
|
}
|
2021-12-15 14:58:27 +01:00
|
|
|
|
2023-09-15 19:04:54 +02:00
|
|
|
// Check whether there is enough disk space for merging pws.
|
|
|
|
newPartSize := getPartsSize(pws)
|
|
|
|
maxOutBytes := fs.MustGetFreeSpace(pt.bigPartsPath)
|
|
|
|
if newPartSize > maxOutBytes {
|
|
|
|
freeSpaceNeededBytes := newPartSize - maxOutBytes
|
|
|
|
forceMergeLogger.Warnf("cannot initiate force merge for the partition %s; additional space needed: %d bytes", pt.name, freeSpaceNeededBytes)
|
|
|
|
pt.releasePartsToMerge(pws)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// If len(pws) == 1, then the merge must run anyway.
|
|
|
|
// This allows applying the configured retention, removing the deleted series
|
|
|
|
// and performing de-duplication if needed.
|
2024-04-02 20:24:57 +02:00
|
|
|
if err := pt.mergePartsToFiles(pws, stopCh, bigPartsConcurrencyCh); err != nil {
|
2023-09-15 19:04:54 +02:00
|
|
|
return fmt.Errorf("cannot force merge %d parts from partition %q: %w", len(pws), pt.name, err)
|
2020-09-17 11:01:53 +02:00
|
|
|
}
|
2023-09-15 19:04:54 +02:00
|
|
|
|
|
|
|
return nil
|
2020-09-17 11:01:53 +02:00
|
|
|
}
|
|
|
|
|
2022-06-27 11:31:16 +02:00
|
|
|
// forceMergeLogger is the logger used by ForceMergeAllParts for the warning
// about insufficient disk space.
//
// It is created via logger.WithThrottler with the "forceMerge" name and time.Minute.
// NOTE(review): presumably this limits the warning to one message per minute - confirm against lib/logger.
var forceMergeLogger = logger.WithThrottler("forceMerge", time.Minute)
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
func (pt *partition) getAllPartsForMerge() []*partWrapper {
|
|
|
|
var pws []*partWrapper
|
|
|
|
pt.partsLock.Lock()
|
|
|
|
if !hasActiveMerges(pt.inmemoryParts) && !hasActiveMerges(pt.smallParts) && !hasActiveMerges(pt.bigParts) {
|
|
|
|
pws = appendAllPartsForMerge(pws, pt.inmemoryParts)
|
|
|
|
pws = appendAllPartsForMerge(pws, pt.smallParts)
|
|
|
|
pws = appendAllPartsForMerge(pws, pt.bigParts)
|
|
|
|
}
|
|
|
|
pt.partsLock.Unlock()
|
|
|
|
return pws
|
|
|
|
}
|
|
|
|
|
|
|
|
func appendAllPartsForMerge(dst, src []*partWrapper) []*partWrapper {
|
2020-09-17 11:01:53 +02:00
|
|
|
for _, pw := range src {
|
|
|
|
if pw.isInMerge {
|
|
|
|
logger.Panicf("BUG: part %q is already in merge", pw.p.path)
|
|
|
|
}
|
|
|
|
pw.isInMerge = true
|
|
|
|
dst = append(dst, pw)
|
|
|
|
}
|
|
|
|
return dst
|
|
|
|
}
|
|
|
|
|
|
|
|
func hasActiveMerges(pws []*partWrapper) bool {
|
|
|
|
for _, pw := range pws {
|
|
|
|
if pw.isInMerge {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
func getMaxInmemoryPartSize() uint64 {
|
|
|
|
// Allocate 10% of allowed memory for in-memory parts.
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
n := uint64(0.1 * float64(memory.Allowed()) / maxInmemoryParts)
|
2022-12-06 00:15:00 +01:00
|
|
|
if n < 1e6 {
|
|
|
|
n = 1e6
|
|
|
|
}
|
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) getMaxSmallPartSize() uint64 {
|
|
|
|
// Small parts are cached in the OS page cache,
|
|
|
|
// so limit their size by the remaining free RAM.
|
|
|
|
mem := memory.Remaining()
|
|
|
|
n := uint64(mem) / defaultPartsToMerge
|
|
|
|
if n < 10e6 {
|
|
|
|
n = 10e6
|
|
|
|
}
|
|
|
|
// Make sure the output part fits available disk space for small parts.
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
sizeLimit := getMaxOutBytes(pt.smallPartsPath, cap(smallPartsConcurrencyCh))
|
2022-12-06 00:15:00 +01:00
|
|
|
if n > sizeLimit {
|
|
|
|
n = sizeLimit
|
|
|
|
}
|
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) getMaxBigPartSize() uint64 {
|
2023-09-25 17:15:43 +02:00
|
|
|
// Always use 4 workers for big merges due to historical reasons.
|
|
|
|
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4915#issuecomment-1733922830
|
|
|
|
workersCount := 4
|
2023-04-14 05:33:33 +02:00
|
|
|
return getMaxOutBytes(pt.bigPartsPath, workersCount)
|
2020-10-07 16:35:42 +02:00
|
|
|
}
|
|
|
|
|
2021-08-25 08:35:03 +02:00
|
|
|
func getMaxOutBytes(path string, workersCount int) uint64 {
|
|
|
|
n := fs.MustGetFreeSpace(path)
|
2022-12-13 01:49:21 +01:00
|
|
|
// Do not subtract freeDiskSpaceLimitBytes from n before calculating the maxOutBytes,
|
2021-12-01 09:56:21 +01:00
|
|
|
// since this will result in sub-optimal merges - e.g. many small parts will be left unmerged.
|
|
|
|
|
2022-12-13 01:49:21 +01:00
|
|
|
// Divide free space by the max number of concurrent merges.
|
2021-08-25 08:35:03 +02:00
|
|
|
maxOutBytes := n / uint64(workersCount)
|
|
|
|
if maxOutBytes > maxBigPartSize {
|
|
|
|
maxOutBytes = maxBigPartSize
|
2019-08-25 13:10:43 +02:00
|
|
|
}
|
2021-08-25 08:35:03 +02:00
|
|
|
return maxOutBytes
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2023-10-02 08:04:59 +02:00
|
|
|
func assertIsInMerge(pws []*partWrapper) {
|
|
|
|
for _, pw := range pws {
|
|
|
|
if !pw.isInMerge {
|
|
|
|
logger.Panicf("BUG: partWrapper.isInMerge unexpectedly set to false")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-12-18 22:14:35 +01:00
|
|
|
func (pt *partition) releasePartsToMerge(pws []*partWrapper) {
|
|
|
|
pt.partsLock.Lock()
|
|
|
|
for _, pw := range pws {
|
|
|
|
if !pw.isInMerge {
|
|
|
|
logger.Panicf("BUG: missing isInMerge flag on the part %q", pw.p.path)
|
|
|
|
}
|
|
|
|
pw.isInMerge = false
|
|
|
|
}
|
|
|
|
pt.partsLock.Unlock()
|
|
|
|
}
|
|
|
|
|
2024-04-02 20:24:57 +02:00
|
|
|
func (pt *partition) runFinalDedup(stopCh <-chan struct{}) error {
|
2021-12-15 14:58:27 +01:00
|
|
|
t := time.Now()
|
2024-03-30 00:39:27 +01:00
|
|
|
logger.Infof("start removing duplicate samples from partition (%s, %s)", pt.bigPartsPath, pt.smallPartsPath)
|
2024-04-02 20:24:57 +02:00
|
|
|
if err := pt.ForceMergeAllParts(stopCh); err != nil {
|
2024-03-30 00:39:27 +01:00
|
|
|
return fmt.Errorf("cannot remove duplicate samples from partition (%s, %s): %w", pt.bigPartsPath, pt.smallPartsPath, err)
|
2021-12-15 14:58:27 +01:00
|
|
|
}
|
2024-03-30 00:39:27 +01:00
|
|
|
logger.Infof("duplicate samples have been removed from partition (%s, %s) in %.3f seconds", pt.bigPartsPath, pt.smallPartsPath, time.Since(t).Seconds())
|
2021-12-15 14:58:27 +01:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-12-20 19:11:38 +01:00
|
|
|
func (pt *partition) isFinalDedupNeeded() bool {
|
2024-03-30 00:39:27 +01:00
|
|
|
dedupInterval := GetDedupInterval()
|
2022-12-20 19:11:38 +01:00
|
|
|
|
2023-02-01 18:54:21 +01:00
|
|
|
pws := pt.GetParts(nil, false)
|
2021-12-15 14:58:27 +01:00
|
|
|
minDedupInterval := getMinDedupInterval(pws)
|
2024-03-30 00:39:27 +01:00
|
|
|
pt.PutParts(pws)
|
|
|
|
|
|
|
|
return dedupInterval > minDedupInterval
|
2021-12-17 19:11:15 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
func getMinDedupInterval(pws []*partWrapper) int64 {
|
|
|
|
if len(pws) == 0 {
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
dMin := pws[0].p.ph.MinDedupInterval
|
|
|
|
for _, pw := range pws[1:] {
|
|
|
|
d := pw.p.ph.MinDedupInterval
|
|
|
|
if d < dMin {
|
|
|
|
dMin = d
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return dMin
|
2021-12-15 14:58:27 +01:00
|
|
|
}
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
// mergeParts merges pws to a single resulting part.
|
2020-09-17 01:05:54 +02:00
|
|
|
//
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// It is expected that pws contains at least a single part.
|
|
|
|
//
|
2020-09-17 01:05:54 +02:00
|
|
|
// Merging is immediately stopped if stopCh is closed.
|
|
|
|
//
|
2022-12-06 00:15:00 +01:00
|
|
|
// if isFinal is set, then the resulting part will be saved to disk.
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
// If at least a single source part at pws is stored on disk, then the resulting part
|
|
|
|
// will be stored to disk.
|
2022-12-06 00:15:00 +01:00
|
|
|
//
|
2020-09-17 01:05:54 +02:00
|
|
|
// All the parts inside pws must have isInMerge field set to true.
|
2023-10-02 08:04:59 +02:00
|
|
|
// The isInMerge field inside pws parts is set to false before returning from the function.
|
2022-12-06 00:15:00 +01:00
|
|
|
func (pt *partition) mergeParts(pws []*partWrapper, stopCh <-chan struct{}, isFinal bool) error {
|
2019-05-22 23:16:55 +02:00
|
|
|
if len(pws) == 0 {
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
logger.Panicf("BUG: empty pws cannot be passed to mergeParts()")
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2023-10-02 08:04:59 +02:00
|
|
|
assertIsInMerge(pws)
|
|
|
|
defer pt.releasePartsToMerge(pws)
|
|
|
|
|
2019-05-22 23:16:55 +02:00
|
|
|
startTime := time.Now()
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
// Initialize destination paths.
|
|
|
|
dstPartType := pt.getDstPartType(pws, isFinal)
|
2023-03-19 09:36:05 +01:00
|
|
|
mergeIdx := pt.nextMergeIdx()
|
|
|
|
dstPartPath := pt.getDstPartPath(dstPartType, mergeIdx)
|
2022-12-06 00:15:00 +01:00
|
|
|
|
2023-02-01 18:54:21 +01:00
|
|
|
if !isDedupEnabled() && isFinal && len(pws) == 1 && pws[0].mp != nil {
|
2022-12-06 00:15:00 +01:00
|
|
|
// Fast path: flush a single in-memory part to disk.
|
|
|
|
mp := pws[0].mp
|
2023-04-14 07:11:56 +02:00
|
|
|
mp.MustStoreToDisk(dstPartPath)
|
2023-03-19 09:36:05 +01:00
|
|
|
pwNew := pt.openCreatedPart(&mp.ph, pws, nil, dstPartPath)
|
2022-12-06 00:15:00 +01:00
|
|
|
pt.swapSrcWithDstParts(pws, pwNew, dstPartType)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-05-22 23:16:55 +02:00
|
|
|
// Prepare BlockStreamReaders for source parts.
|
2023-04-15 00:46:09 +02:00
|
|
|
bsrs := mustOpenBlockStreamReaders(pws)
|
2022-12-06 00:15:00 +01:00
|
|
|
|
|
|
|
// Prepare BlockStreamWriter for destination part.
|
|
|
|
srcSize := uint64(0)
|
|
|
|
srcRowsCount := uint64(0)
|
|
|
|
srcBlocksCount := uint64(0)
|
2019-05-22 23:16:55 +02:00
|
|
|
for _, pw := range pws {
|
2022-12-06 00:15:00 +01:00
|
|
|
srcSize += pw.p.size
|
|
|
|
srcRowsCount += pw.p.ph.RowsCount
|
|
|
|
srcBlocksCount += pw.p.ph.BlocksCount
|
|
|
|
}
|
|
|
|
rowsPerBlock := float64(srcRowsCount) / float64(srcBlocksCount)
|
|
|
|
compressLevel := getCompressLevel(rowsPerBlock)
|
|
|
|
bsw := getBlockStreamWriter()
|
|
|
|
var mpNew *inmemoryPart
|
|
|
|
if dstPartType == partInmemory {
|
|
|
|
mpNew = getInmemoryPart()
|
2023-04-15 00:46:09 +02:00
|
|
|
bsw.MustInitFromInmemoryPart(mpNew, compressLevel)
|
2022-12-06 00:15:00 +01:00
|
|
|
} else {
|
2023-03-19 09:36:05 +01:00
|
|
|
if dstPartPath == "" {
|
|
|
|
logger.Panicf("BUG: dstPartPath must be non-empty")
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
|
|
|
nocache := dstPartType == partBig
|
2023-04-15 00:12:45 +02:00
|
|
|
bsw.MustInitFromFilePart(dstPartPath, nocache, compressLevel)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
// Merge source parts to destination part.
|
2023-03-19 09:36:05 +01:00
|
|
|
ph, err := pt.mergePartsInternal(dstPartPath, bsw, bsrs, dstPartType, stopCh)
|
2022-12-06 00:15:00 +01:00
|
|
|
putBlockStreamWriter(bsw)
|
2023-04-15 00:46:09 +02:00
|
|
|
for _, bsr := range bsrs {
|
|
|
|
putBlockStreamReader(bsr)
|
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
if err != nil {
|
2023-03-19 09:36:05 +01:00
|
|
|
return err
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
|
|
|
if mpNew != nil {
|
|
|
|
// Update partHeader for destination inmemory part after the merge.
|
|
|
|
mpNew.ph = *ph
|
2023-04-14 06:03:06 +02:00
|
|
|
} else {
|
|
|
|
// Make sure the created part directory listing is synced.
|
|
|
|
fs.MustSyncPath(dstPartPath)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
// Atomically swap the source parts with the newly created part.
|
|
|
|
pwNew := pt.openCreatedPart(ph, pws, mpNew, dstPartPath)
|
2022-12-06 00:15:00 +01:00
|
|
|
|
|
|
|
dstRowsCount := uint64(0)
|
|
|
|
dstBlocksCount := uint64(0)
|
|
|
|
dstSize := uint64(0)
|
|
|
|
if pwNew != nil {
|
|
|
|
pDst := pwNew.p
|
|
|
|
dstRowsCount = pDst.ph.RowsCount
|
|
|
|
dstBlocksCount = pDst.ph.BlocksCount
|
|
|
|
dstSize = pDst.size
|
|
|
|
}
|
2023-03-03 12:33:42 +01:00
|
|
|
|
|
|
|
pt.swapSrcWithDstParts(pws, pwNew, dstPartType)
|
|
|
|
|
|
|
|
d := time.Since(startTime)
|
|
|
|
if d <= 30*time.Second {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Log stats for long merges.
|
2022-12-06 00:15:00 +01:00
|
|
|
durationSecs := d.Seconds()
|
|
|
|
rowsPerSec := int(float64(srcRowsCount) / durationSecs)
|
|
|
|
logger.Infof("merged (%d parts, %d rows, %d blocks, %d bytes) into (1 part, %d rows, %d blocks, %d bytes) in %.3f seconds at %d rows/sec to %q",
|
|
|
|
len(pws), srcRowsCount, srcBlocksCount, srcSize, dstRowsCount, dstBlocksCount, dstSize, durationSecs, rowsPerSec, dstPartPath)
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func getFlushToDiskDeadline(pws []*partWrapper) time.Time {
|
2023-04-14 08:17:10 +02:00
|
|
|
d := time.Now().Add(dataFlushInterval)
|
|
|
|
for _, pw := range pws {
|
|
|
|
if pw.mp != nil && pw.flushToDiskDeadline.Before(d) {
|
2022-12-06 00:15:00 +01:00
|
|
|
d = pw.flushToDiskDeadline
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return d
|
|
|
|
}
|
|
|
|
|
|
|
|
// partType is the type of a part: partInmemory, partSmall or partBig.
type partType int
|
|
|
|
|
|
|
|
var (
|
|
|
|
partInmemory = partType(0)
|
|
|
|
partSmall = partType(1)
|
|
|
|
partBig = partType(2)
|
|
|
|
)
|
|
|
|
|
|
|
|
func (pt *partition) getDstPartType(pws []*partWrapper, isFinal bool) partType {
|
|
|
|
dstPartSize := getPartsSize(pws)
|
|
|
|
if dstPartSize > pt.getMaxSmallPartSize() {
|
|
|
|
return partBig
|
|
|
|
}
|
|
|
|
if isFinal || dstPartSize > getMaxInmemoryPartSize() {
|
|
|
|
return partSmall
|
|
|
|
}
|
|
|
|
if !areAllInmemoryParts(pws) {
|
|
|
|
// If at least a single source part is located in file,
|
|
|
|
// then the destination part must be in file for durability reasons.
|
|
|
|
return partSmall
|
|
|
|
}
|
|
|
|
return partInmemory
|
|
|
|
}
|
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
func (pt *partition) getDstPartPath(dstPartType partType, mergeIdx uint64) string {
|
2022-12-06 00:15:00 +01:00
|
|
|
ptPath := ""
|
|
|
|
switch dstPartType {
|
|
|
|
case partSmall:
|
|
|
|
ptPath = pt.smallPartsPath
|
|
|
|
case partBig:
|
2019-05-22 23:16:55 +02:00
|
|
|
ptPath = pt.bigPartsPath
|
2022-12-06 00:15:00 +01:00
|
|
|
case partInmemory:
|
|
|
|
ptPath = pt.smallPartsPath
|
|
|
|
default:
|
|
|
|
logger.Panicf("BUG: unknown partType=%d", dstPartType)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2023-03-19 09:36:05 +01:00
|
|
|
dstPartPath := ""
|
2022-12-06 00:15:00 +01:00
|
|
|
if dstPartType != partInmemory {
|
2023-03-25 22:33:54 +01:00
|
|
|
dstPartPath = filepath.Join(ptPath, fmt.Sprintf("%016X", mergeIdx))
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2023-03-19 09:36:05 +01:00
|
|
|
return dstPartPath
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2023-04-15 00:46:09 +02:00
|
|
|
func mustOpenBlockStreamReaders(pws []*partWrapper) []*blockStreamReader {
|
2022-12-06 00:15:00 +01:00
|
|
|
bsrs := make([]*blockStreamReader, 0, len(pws))
|
|
|
|
for _, pw := range pws {
|
|
|
|
bsr := getBlockStreamReader()
|
|
|
|
if pw.mp != nil {
|
2023-04-15 00:46:09 +02:00
|
|
|
bsr.MustInitFromInmemoryPart(pw.mp)
|
2022-12-06 00:15:00 +01:00
|
|
|
} else {
|
2023-04-15 00:46:09 +02:00
|
|
|
bsr.MustInitFromFilePart(pw.p.path)
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
|
|
|
bsrs = append(bsrs, bsr)
|
|
|
|
}
|
2023-04-15 00:46:09 +02:00
|
|
|
return bsrs
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
func (pt *partition) mergePartsInternal(dstPartPath string, bsw *blockStreamWriter, bsrs []*blockStreamReader, dstPartType partType, stopCh <-chan struct{}) (*partHeader, error) {
|
2019-05-22 23:16:55 +02:00
|
|
|
var ph partHeader
|
2024-02-23 23:15:21 +01:00
|
|
|
var rowsMerged *atomic.Uint64
|
|
|
|
var rowsDeleted *atomic.Uint64
|
|
|
|
var mergesCount *atomic.Uint64
|
|
|
|
var activeMerges *atomic.Int64
|
2022-12-06 00:15:00 +01:00
|
|
|
switch dstPartType {
|
|
|
|
case partInmemory:
|
|
|
|
rowsMerged = &pt.inmemoryRowsMerged
|
|
|
|
rowsDeleted = &pt.inmemoryRowsDeleted
|
|
|
|
mergesCount = &pt.inmemoryMergesCount
|
|
|
|
activeMerges = &pt.activeInmemoryMerges
|
|
|
|
case partSmall:
|
|
|
|
rowsMerged = &pt.smallRowsMerged
|
|
|
|
rowsDeleted = &pt.smallRowsDeleted
|
|
|
|
mergesCount = &pt.smallMergesCount
|
|
|
|
activeMerges = &pt.activeSmallMerges
|
|
|
|
case partBig:
|
2019-05-22 23:16:55 +02:00
|
|
|
rowsMerged = &pt.bigRowsMerged
|
|
|
|
rowsDeleted = &pt.bigRowsDeleted
|
2022-12-06 00:15:00 +01:00
|
|
|
mergesCount = &pt.bigMergesCount
|
|
|
|
activeMerges = &pt.activeBigMerges
|
|
|
|
default:
|
|
|
|
logger.Panicf("BUG: unknown partType=%d", dstPartType)
|
2020-07-22 23:58:48 +02:00
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
retentionDeadline := timestampFromTime(time.Now()) - pt.s.retentionMsecs
|
2024-02-23 23:15:21 +01:00
|
|
|
activeMerges.Add(1)
|
2022-10-23 15:08:54 +02:00
|
|
|
err := mergeBlockStreams(&ph, bsw, bsrs, stopCh, pt.s, retentionDeadline, rowsMerged, rowsDeleted)
|
2024-02-23 23:15:21 +01:00
|
|
|
activeMerges.Add(-1)
|
|
|
|
mergesCount.Add(1)
|
2019-05-22 23:16:55 +02:00
|
|
|
if err != nil {
|
2023-03-19 09:36:05 +01:00
|
|
|
return nil, fmt.Errorf("cannot merge %d parts to %s: %w", len(bsrs), dstPartPath, err)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2023-03-19 09:36:05 +01:00
|
|
|
if dstPartPath != "" {
|
2022-12-06 00:15:00 +01:00
|
|
|
ph.MinDedupInterval = GetDedupInterval()
|
2023-04-14 06:33:15 +02:00
|
|
|
ph.MustWriteMetadata(dstPartPath)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
return &ph, nil
|
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
func (pt *partition) openCreatedPart(ph *partHeader, pws []*partWrapper, mpNew *inmemoryPart, dstPartPath string) *partWrapper {
|
2022-12-06 00:15:00 +01:00
|
|
|
// Open the created part.
|
|
|
|
if ph.RowsCount == 0 {
|
2023-03-19 09:36:05 +01:00
|
|
|
// The created part is empty. Remove it
|
|
|
|
if mpNew == nil {
|
|
|
|
fs.MustRemoveAll(dstPartPath)
|
|
|
|
}
|
|
|
|
return nil
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
if mpNew != nil {
|
|
|
|
// Open the created part from memory.
|
|
|
|
flushToDiskDeadline := getFlushToDiskDeadline(pws)
|
|
|
|
pwNew := newPartWrapperFromInmemoryPart(mpNew, flushToDiskDeadline)
|
2023-03-19 09:36:05 +01:00
|
|
|
return pwNew
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
// Open the created part from disk.
|
2023-04-15 00:46:09 +02:00
|
|
|
pNew := mustOpenFilePart(dstPartPath)
|
2022-12-06 00:15:00 +01:00
|
|
|
pwNew := &partWrapper{
|
2024-02-23 21:54:55 +01:00
|
|
|
p: pNew,
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2024-02-23 21:54:55 +01:00
|
|
|
pwNew.incRef()
|
2023-03-19 09:36:05 +01:00
|
|
|
return pwNew
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
func areAllInmemoryParts(pws []*partWrapper) bool {
|
|
|
|
for _, pw := range pws {
|
|
|
|
if pw.mp == nil {
|
|
|
|
return false
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
return true
|
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
func (pt *partition) swapSrcWithDstParts(pws []*partWrapper, pwNew *partWrapper, dstPartType partType) {
|
|
|
|
// Atomically unregister old parts and add new part to pt.
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
m := makeMapFromPartWrappers(pws)
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
removedInmemoryParts := 0
|
2019-05-22 23:16:55 +02:00
|
|
|
removedSmallParts := 0
|
|
|
|
removedBigParts := 0
|
2022-12-06 00:15:00 +01:00
|
|
|
|
2019-05-22 23:16:55 +02:00
|
|
|
pt.partsLock.Lock()
|
2023-03-19 09:36:05 +01:00
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
pt.inmemoryParts, removedInmemoryParts = removeParts(pt.inmemoryParts, m)
|
|
|
|
pt.smallParts, removedSmallParts = removeParts(pt.smallParts, m)
|
|
|
|
pt.bigParts, removedBigParts = removeParts(pt.bigParts, m)
|
|
|
|
if pwNew != nil {
|
|
|
|
switch dstPartType {
|
|
|
|
case partInmemory:
|
|
|
|
pt.inmemoryParts = append(pt.inmemoryParts, pwNew)
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
pt.startInmemoryPartsMergerLocked()
|
2022-12-06 00:15:00 +01:00
|
|
|
case partSmall:
|
|
|
|
pt.smallParts = append(pt.smallParts, pwNew)
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
pt.startSmallPartsMergerLocked()
|
2022-12-06 00:15:00 +01:00
|
|
|
case partBig:
|
|
|
|
pt.bigParts = append(pt.bigParts, pwNew)
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
pt.startBigPartsMergerLocked()
|
2022-12-06 00:15:00 +01:00
|
|
|
default:
|
|
|
|
logger.Panicf("BUG: unknown partType=%d", dstPartType)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
}
|
2023-03-19 09:36:05 +01:00
|
|
|
|
|
|
|
// Atomically store the updated list of file-based parts on disk.
|
|
|
|
// This must be performed under partsLock in order to prevent from races
|
|
|
|
// when multiple concurrently running goroutines update the list.
|
|
|
|
if removedSmallParts > 0 || removedBigParts > 0 || pwNew != nil && (dstPartType == partSmall || dstPartType == partBig) {
|
|
|
|
mustWritePartNames(pt.smallParts, pt.bigParts, pt.smallPartsPath)
|
|
|
|
}
|
|
|
|
|
2019-05-22 23:16:55 +02:00
|
|
|
pt.partsLock.Unlock()
|
2022-12-06 00:15:00 +01:00
|
|
|
|
|
|
|
removedParts := removedInmemoryParts + removedSmallParts + removedBigParts
|
|
|
|
if removedParts != len(m) {
|
|
|
|
logger.Panicf("BUG: unexpected number of parts removed; got %d, want %d", removedParts, len(m))
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
// Mark old parts as must be deleted and decrement reference count,
|
|
|
|
// so they are eventually closed and deleted.
|
2019-05-22 23:16:55 +02:00
|
|
|
for _, pw := range pws {
|
2024-02-23 21:54:55 +01:00
|
|
|
pw.mustDrop.Store(true)
|
2019-05-22 23:16:55 +02:00
|
|
|
pw.decRef()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-12-04 08:10:16 +01:00
|
|
|
// getCompressLevel returns the compression level to use for the given
// average number of rows per block.
//
// Bigger rowsPerBlock values map to higher levels:
// <=10 -> -5, <=50 -> -2, <=200 -> -1, <=500 -> 1, <=1000 -> 2, otherwise 3.
//
// See https://github.com/facebook/zstd/releases/tag/v1.3.4 about negative compression levels.
func getCompressLevel(rowsPerBlock float64) int {
	switch {
	case rowsPerBlock <= 10:
		return -5
	case rowsPerBlock <= 50:
		return -2
	case rowsPerBlock <= 200:
		return -1
	case rowsPerBlock <= 500:
		return 1
	case rowsPerBlock <= 1000:
		return 2
	default:
		return 3
	}
}
|
|
|
|
|
|
|
|
func (pt *partition) nextMergeIdx() uint64 {
|
2024-02-23 23:15:21 +01:00
|
|
|
return pt.mergeIdx.Add(1)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
func removeParts(pws []*partWrapper, partsToRemove map[*partWrapper]struct{}) ([]*partWrapper, int) {
|
2019-05-22 23:16:55 +02:00
|
|
|
dst := pws[:0]
|
|
|
|
for _, pw := range pws {
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
if _, ok := partsToRemove[pw]; !ok {
|
2020-09-17 01:05:54 +02:00
|
|
|
dst = append(dst, pw)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
for i := len(dst); i < len(pws); i++ {
|
|
|
|
pws[i] = nil
|
|
|
|
}
|
|
|
|
return dst, len(pws) - len(dst)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2020-12-22 18:48:27 +01:00
|
|
|
func (pt *partition) stalePartsRemover() {
|
2024-01-22 17:12:37 +01:00
|
|
|
d := timeutil.AddJitterToDuration(7 * time.Minute)
|
|
|
|
ticker := time.NewTicker(d)
|
2020-12-22 18:48:27 +01:00
|
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-pt.stopCh:
|
|
|
|
return
|
|
|
|
case <-ticker.C:
|
|
|
|
pt.removeStaleParts()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (pt *partition) removeStaleParts() {
|
|
|
|
startTime := time.Now()
|
2022-10-24 00:30:50 +02:00
|
|
|
retentionDeadline := timestampFromTime(startTime) - pt.s.retentionMsecs
|
2020-12-22 18:48:27 +01:00
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
var pws []*partWrapper
|
2020-12-22 18:48:27 +01:00
|
|
|
pt.partsLock.Lock()
|
2022-12-06 00:15:00 +01:00
|
|
|
for _, pw := range pt.inmemoryParts {
|
2020-12-24 07:50:10 +01:00
|
|
|
if !pw.isInMerge && pw.p.ph.MaxTimestamp < retentionDeadline {
|
2024-02-23 23:15:21 +01:00
|
|
|
pt.inmemoryRowsDeleted.Add(pw.p.ph.RowsCount)
|
2023-03-19 09:36:05 +01:00
|
|
|
pw.isInMerge = true
|
|
|
|
pws = append(pws, pw)
|
2020-12-22 18:48:27 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
for _, pw := range pt.smallParts {
|
2020-12-24 07:50:10 +01:00
|
|
|
if !pw.isInMerge && pw.p.ph.MaxTimestamp < retentionDeadline {
|
2024-02-23 23:15:21 +01:00
|
|
|
pt.smallRowsDeleted.Add(pw.p.ph.RowsCount)
|
2023-03-19 09:36:05 +01:00
|
|
|
pw.isInMerge = true
|
|
|
|
pws = append(pws, pw)
|
2020-12-22 18:48:27 +01:00
|
|
|
}
|
|
|
|
}
|
2022-12-06 00:15:00 +01:00
|
|
|
for _, pw := range pt.bigParts {
|
|
|
|
if !pw.isInMerge && pw.p.ph.MaxTimestamp < retentionDeadline {
|
2024-02-23 23:15:21 +01:00
|
|
|
pt.bigRowsDeleted.Add(pw.p.ph.RowsCount)
|
2023-03-19 09:36:05 +01:00
|
|
|
pw.isInMerge = true
|
|
|
|
pws = append(pws, pw)
|
2022-12-06 00:15:00 +01:00
|
|
|
}
|
|
|
|
}
|
2020-12-22 18:48:27 +01:00
|
|
|
pt.partsLock.Unlock()
|
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
pt.swapSrcWithDstParts(pws, nil, partSmall)
|
2020-12-22 18:48:27 +01:00
|
|
|
}
|
|
|
|
|
2019-05-22 23:16:55 +02:00
|
|
|
// getPartsToMerge returns optimal parts to merge from pws.
|
|
|
|
//
|
2021-08-25 08:35:03 +02:00
|
|
|
// The summary size of the returned parts must be smaller than maxOutBytes.
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
func getPartsToMerge(pws []*partWrapper, maxOutBytes uint64) []*partWrapper {
|
2019-05-22 23:16:55 +02:00
|
|
|
pwsRemaining := make([]*partWrapper, 0, len(pws))
|
|
|
|
for _, pw := range pws {
|
|
|
|
if !pw.isInMerge {
|
|
|
|
pwsRemaining = append(pwsRemaining, pw)
|
|
|
|
}
|
|
|
|
}
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
|
|
|
|
pwsToMerge := appendPartsToMerge(nil, pwsRemaining, defaultPartsToMerge, maxOutBytes)
|
|
|
|
|
|
|
|
for _, pw := range pwsToMerge {
|
2019-05-22 23:16:55 +02:00
|
|
|
if pw.isInMerge {
|
|
|
|
logger.Panicf("BUG: partWrapper.isInMerge cannot be set")
|
|
|
|
}
|
|
|
|
pw.isInMerge = true
|
|
|
|
}
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
|
|
|
|
return pwsToMerge
|
|
|
|
}
|
|
|
|
|
|
|
|
// getPartsForOptimalMerge returns parts from pws for optimal merge, plus the remaining parts.
|
|
|
|
//
|
|
|
|
// the pws items are replaced by nil after the call. This is needed for helping Go GC to reclaim the referenced items.
|
|
|
|
func getPartsForOptimalMerge(pws []*partWrapper) ([]*partWrapper, []*partWrapper) {
|
|
|
|
pwsToMerge := appendPartsToMerge(nil, pws, defaultPartsToMerge, 1<<64-1)
|
|
|
|
if len(pwsToMerge) == 0 {
|
|
|
|
return pws, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
m := makeMapFromPartWrappers(pwsToMerge)
|
|
|
|
pwsRemaining := make([]*partWrapper, 0, len(pws)-len(pwsToMerge))
|
|
|
|
for _, pw := range pws {
|
|
|
|
if _, ok := m[pw]; !ok {
|
|
|
|
pwsRemaining = append(pwsRemaining, pw)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Clear references to pws items, so they could be reclaimed faster by Go GC.
|
|
|
|
for i := range pws {
|
|
|
|
pws[i] = nil
|
|
|
|
}
|
|
|
|
|
|
|
|
return pwsToMerge, pwsRemaining
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2021-08-25 08:35:03 +02:00
|
|
|
// minMergeMultiplier is the minimum multiplier for the size of the output part
|
|
|
|
// compared to the size of the maximum input part for the merge.
|
|
|
|
//
|
|
|
|
// Higher value reduces write amplification (disk write IO induced by the merge),
|
|
|
|
// while increasing the number of unmerged parts.
|
|
|
|
// The 1.7 is good enough for production workloads.
|
|
|
|
const minMergeMultiplier = 1.7
|
|
|
|
|
2023-09-25 16:52:37 +02:00
|
|
|
// appendPartsToMerge finds optimal parts to merge from src, appends them to dst and returns the result.
|
|
|
|
func appendPartsToMerge(dst, src []*partWrapper, maxPartsToMerge int, maxOutBytes uint64) []*partWrapper {
|
2019-05-22 23:16:55 +02:00
|
|
|
if len(src) < 2 {
|
|
|
|
// There is no need in merging zero or one part :)
|
2023-09-25 16:52:37 +02:00
|
|
|
return dst
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
if maxPartsToMerge < 2 {
|
|
|
|
logger.Panicf("BUG: maxPartsToMerge cannot be smaller than 2; got %d", maxPartsToMerge)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Filter out too big parts.
|
2020-07-31 12:48:35 +02:00
|
|
|
// This should reduce N for O(N^2) algorithm below.
|
2021-08-25 08:35:03 +02:00
|
|
|
maxInPartBytes := uint64(float64(maxOutBytes) / minMergeMultiplier)
|
2019-05-22 23:16:55 +02:00
|
|
|
tmp := make([]*partWrapper, 0, len(src))
|
|
|
|
for _, pw := range src {
|
2021-08-25 08:35:03 +02:00
|
|
|
if pw.p.size > maxInPartBytes {
|
2019-05-22 23:16:55 +02:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
tmp = append(tmp, pw)
|
|
|
|
}
|
|
|
|
src = tmp
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
sortPartsForOptimalMerge(src)
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2020-12-18 19:00:06 +01:00
|
|
|
maxSrcParts := maxPartsToMerge
|
2021-07-02 16:24:14 +02:00
|
|
|
if maxSrcParts > len(src) {
|
2020-12-18 19:00:06 +01:00
|
|
|
maxSrcParts = len(src)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2021-07-02 16:24:14 +02:00
|
|
|
minSrcParts := (maxSrcParts + 1) / 2
|
|
|
|
if minSrcParts < 2 {
|
|
|
|
minSrcParts = 2
|
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2020-12-18 19:00:06 +01:00
|
|
|
// Exhaustive search for parts giving the lowest write amplification when merged.
|
2019-05-22 23:16:55 +02:00
|
|
|
var pws []*partWrapper
|
|
|
|
maxM := float64(0)
|
2020-12-18 19:00:06 +01:00
|
|
|
for i := minSrcParts; i <= maxSrcParts; i++ {
|
2019-05-22 23:16:55 +02:00
|
|
|
for j := 0; j <= len(src)-i; j++ {
|
2019-10-29 11:45:19 +01:00
|
|
|
a := src[j : j+i]
|
2021-08-25 08:35:03 +02:00
|
|
|
if a[0].p.size*uint64(len(a)) < a[len(a)-1].p.size {
|
|
|
|
// Do not merge parts with too big difference in size,
|
2020-12-18 19:00:06 +01:00
|
|
|
// since this results in unbalanced merges.
|
|
|
|
continue
|
|
|
|
}
|
2023-09-25 16:52:37 +02:00
|
|
|
outSize := getPartsSize(a)
|
2021-08-25 08:35:03 +02:00
|
|
|
if outSize > maxOutBytes {
|
|
|
|
// There is no need in verifying remaining parts with bigger sizes.
|
2019-10-29 11:45:19 +01:00
|
|
|
break
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2021-08-25 08:35:03 +02:00
|
|
|
m := float64(outSize) / float64(a[len(a)-1].p.size)
|
2019-05-22 23:16:55 +02:00
|
|
|
if m < maxM {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
maxM = m
|
2019-10-29 11:45:19 +01:00
|
|
|
pws = a
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-10-29 11:45:19 +01:00
|
|
|
minM := float64(maxPartsToMerge) / 2
|
2021-08-25 08:35:03 +02:00
|
|
|
if minM < minMergeMultiplier {
|
|
|
|
minM = minMergeMultiplier
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
if maxM < minM {
|
2021-08-25 08:35:03 +02:00
|
|
|
// There is no sense in merging parts with too small m,
|
|
|
|
// since this leads to high disk write IO.
|
2023-09-25 16:52:37 +02:00
|
|
|
return dst
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2023-09-25 16:52:37 +02:00
|
|
|
return append(dst, pws...)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2022-12-06 00:15:00 +01:00
|
|
|
func sortPartsForOptimalMerge(pws []*partWrapper) {
|
|
|
|
// Sort src parts by size and backwards timestamp.
|
|
|
|
// This should improve adjanced points' locality in the merged parts.
|
|
|
|
sort.Slice(pws, func(i, j int) bool {
|
|
|
|
a := pws[i].p
|
|
|
|
b := pws[j].p
|
|
|
|
if a.size == b.size {
|
|
|
|
return a.ph.MinTimestamp > b.ph.MinTimestamp
|
|
|
|
}
|
|
|
|
return a.size < b.size
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradation for queries, since every query needs to check an ever-growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks to @misutoth for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
func makeMapFromPartWrappers(pws []*partWrapper) map[*partWrapper]struct{} {
|
|
|
|
m := make(map[*partWrapper]struct{}, len(pws))
|
|
|
|
for _, pw := range pws {
|
|
|
|
m[pw] = struct{}{}
|
|
|
|
}
|
|
|
|
if len(m) != len(pws) {
|
|
|
|
logger.Panicf("BUG: %d duplicate parts found in %d source parts", len(pws)-len(m), len(pws))
|
|
|
|
}
|
|
|
|
return m
|
|
|
|
}
|
|
|
|
|
2021-08-25 08:35:03 +02:00
|
|
|
func getPartsSize(pws []*partWrapper) uint64 {
|
2020-12-18 22:14:35 +01:00
|
|
|
n := uint64(0)
|
|
|
|
for _, pw := range pws {
|
2021-08-25 08:35:03 +02:00
|
|
|
n += pw.p.size
|
2020-12-18 22:14:35 +01:00
|
|
|
}
|
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
2024-04-16 19:07:36 +02:00
|
|
|
func mustOpenParts(partsFile, path string, partNames []string) []*partWrapper {
|
2019-11-02 01:26:02 +01:00
|
|
|
// The path can be missing after restoring from backup, so create it if needed.
|
2023-04-14 07:11:56 +02:00
|
|
|
fs.MustMkdirIfNotExist(path)
|
2022-09-13 14:28:01 +02:00
|
|
|
fs.MustRemoveTemporaryDirs(path)
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
// Remove txn and tmp directories, which may be left after the upgrade
|
|
|
|
// to v1.90.0 and newer versions.
|
2023-03-25 22:33:54 +01:00
|
|
|
fs.MustRemoveAll(filepath.Join(path, "txn"))
|
|
|
|
fs.MustRemoveAll(filepath.Join(path, "tmp"))
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
// Remove dirs missing in partNames. These dirs may be left after unclean shutdown
|
|
|
|
// or after the update from versions prior to v1.90.0.
|
2023-04-15 07:08:43 +02:00
|
|
|
des := fs.MustReadDir(path)
|
2023-03-19 09:36:05 +01:00
|
|
|
m := make(map[string]struct{}, len(partNames))
|
|
|
|
for _, partName := range partNames {
|
2023-09-19 11:17:41 +02:00
|
|
|
// Make sure the partName exists on disk.
|
|
|
|
// If it is missing, then manual action from the user is needed,
|
|
|
|
// since this is unexpected state, which cannot occur under normal operation,
|
|
|
|
// including unclean shutdown.
|
|
|
|
partPath := filepath.Join(path, partName)
|
|
|
|
if !fs.IsPathExist(partPath) {
|
|
|
|
logger.Panicf("FATAL: part %q is listed in %q, but is missing on disk; "+
|
|
|
|
"ensure %q contents is not corrupted; remove %q to rebuild its' content from the list of existing parts",
|
|
|
|
partPath, partsFile, partsFile, partsFile)
|
|
|
|
}
|
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
m[partName] = struct{}{}
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2023-03-18 05:03:34 +01:00
|
|
|
for _, de := range des {
|
|
|
|
if !fs.IsDirOrSymlink(de) {
|
2019-05-22 23:16:55 +02:00
|
|
|
// Skip non-directories.
|
|
|
|
continue
|
|
|
|
}
|
2023-03-18 05:03:34 +01:00
|
|
|
fn := de.Name()
|
2023-03-19 09:36:05 +01:00
|
|
|
if _, ok := m[fn]; !ok {
|
2023-03-25 22:33:54 +01:00
|
|
|
deletePath := filepath.Join(path, fn)
|
2024-04-16 19:07:36 +02:00
|
|
|
logger.Infof("deleting %q because it isn't listed in %q; this is the expected case after unclean shutdown", deletePath, partsFile)
|
2023-03-19 09:36:05 +01:00
|
|
|
fs.MustRemoveAll(deletePath)
|
2021-04-22 11:58:53 +02:00
|
|
|
}
|
2023-03-19 09:36:05 +01:00
|
|
|
}
|
|
|
|
fs.MustSyncPath(path)
|
|
|
|
|
|
|
|
// Open parts
|
|
|
|
var pws []*partWrapper
|
|
|
|
for _, partName := range partNames {
|
2023-03-25 22:33:54 +01:00
|
|
|
partPath := filepath.Join(path, partName)
|
2023-04-15 00:46:09 +02:00
|
|
|
p := mustOpenFilePart(partPath)
|
2019-05-22 23:16:55 +02:00
|
|
|
pw := &partWrapper{
|
2024-02-23 21:54:55 +01:00
|
|
|
p: p,
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2024-02-23 21:54:55 +01:00
|
|
|
pw.incRef()
|
2019-05-22 23:16:55 +02:00
|
|
|
pws = append(pws, pw)
|
|
|
|
}
|
|
|
|
|
2023-04-15 07:08:43 +02:00
|
|
|
return pws
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2023-04-14 08:02:55 +02:00
|
|
|
// MustCreateSnapshotAt creates pt snapshot at the given smallPath and bigPath dirs.
|
2019-05-22 23:16:55 +02:00
|
|
|
//
|
2023-03-19 09:36:05 +01:00
|
|
|
// Snapshot is created using linux hard links, so it is usually created very quickly.
|
2023-04-14 08:02:55 +02:00
|
|
|
func (pt *partition) MustCreateSnapshotAt(smallPath, bigPath string) {
|
2019-05-22 23:16:55 +02:00
|
|
|
logger.Infof("creating partition snapshot of %q and %q...", pt.smallPartsPath, pt.bigPartsPath)
|
|
|
|
startTime := time.Now()
|
|
|
|
|
|
|
|
// Flush inmemory data to disk.
|
lib/{mergeset,storage}: make background merge more responsive and scalable
- Maintain a separate worker pool per each part type (in-memory, file, big and small).
Previously a shared pool was used for merging all the part types.
A single merge worker could merge parts with mixed types at once. For example,
it could merge simultaneously an in-memory part plus a big file part.
Such a merge could take hours for big file part. During the duration of this merge
the in-memory part was pinned in memory and couldn't be persisted to disk
under the configured -inmemoryDataFlushInterval .
Another common issue, which could happen when parts with mixed types are merged,
is uncontrolled growth of in-memory parts or small parts when all the merge workers
were busy with merging big files. Such growth could lead to significant performance
degradataion for queries, since every query needs to check ever growing list of parts.
This could also slow down the registration of new time series, since VictoriaMetrics
searches for the internal series_id in the indexdb for every new time series.
The third issue is graceful shutdown duration, which could be very long when a background
merge is running on in-memory parts plus big file parts. This merge couldn't be interrupted,
since it merges in-memory parts.
A separate pool of merge workers per every part type elegantly resolves both issues:
- In-memory parts are merged to file-based parts in a timely manner, since the maximum
size of in-memory parts is limited.
- Long-running merges for big parts do not block merges for in-memory parts and small parts.
- Graceful shutdown duration is now limited by the time needed for flushing in-memory parts to files.
Merging for file parts is instantly canceled on graceful shutdown now.
- Deprecate -smallMergeConcurrency command-line flag, since the new background merge algorithm
should automatically self-tune according to the number of available CPU cores.
- Deprecate -finalMergeDelay command-line flag, since it wasn't working correctly.
It is better to run forced merge when needed - https://docs.victoriametrics.com/#forced-merge
- Tune the number of shards for pending rows and items before the data goes to in-memory parts
and becomes visible for search. This improves the maximum data ingestion rate and the maximum rate
for registration of new time series. This should reduce the duration of data ingestion slowdown
in VictoriaMetrics cluster on e.g. re-routing events, when some of vmstorage nodes become temporarily
unavailable.
- Prevent from possible "sync: WaitGroup misuse" panic on graceful shutdown.
This is a follow-up for fa566c68a6ccf7385a05f649aee7e5f5a38afb15 .
Thanks @misutoth to for the inspiration at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5212
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5190
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3647
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3641
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/648
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/291
2024-01-26 21:39:49 +01:00
|
|
|
pt.flushInmemoryRowsToFiles()
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
pt.partsLock.Lock()
|
|
|
|
incRefForParts(pt.smallParts)
|
|
|
|
pwsSmall := append([]*partWrapper{}, pt.smallParts...)
|
|
|
|
incRefForParts(pt.bigParts)
|
|
|
|
pwsBig := append([]*partWrapper{}, pt.bigParts...)
|
|
|
|
pt.partsLock.Unlock()
|
|
|
|
|
|
|
|
defer func() {
|
|
|
|
pt.PutParts(pwsSmall)
|
|
|
|
pt.PutParts(pwsBig)
|
|
|
|
}()
|
|
|
|
|
2023-04-14 07:11:56 +02:00
|
|
|
fs.MustMkdirFailIfExist(smallPath)
|
|
|
|
fs.MustMkdirFailIfExist(bigPath)
|
2023-03-19 09:36:05 +01:00
|
|
|
|
|
|
|
// Create a file with part names at smallPath
|
|
|
|
mustWritePartNames(pwsSmall, pwsBig, smallPath)
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2023-04-14 08:02:55 +02:00
|
|
|
pt.mustCreateSnapshot(pt.smallPartsPath, smallPath, pwsSmall)
|
|
|
|
pt.mustCreateSnapshot(pt.bigPartsPath, bigPath, pwsBig)
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2020-01-22 17:27:44 +01:00
|
|
|
logger.Infof("created partition snapshot of %q and %q at %q and %q in %.3f seconds",
|
|
|
|
pt.smallPartsPath, pt.bigPartsPath, smallPath, bigPath, time.Since(startTime).Seconds())
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2023-04-14 08:02:55 +02:00
|
|
|
// mustCreateSnapshot creates a snapshot from srcDir to dstDir.
|
|
|
|
func (pt *partition) mustCreateSnapshot(srcDir, dstDir string, pws []*partWrapper) {
|
2023-03-19 09:36:05 +01:00
|
|
|
// Make hardlinks for pws at dstDir
|
|
|
|
for _, pw := range pws {
|
|
|
|
srcPartPath := pw.p.path
|
2023-03-25 22:33:54 +01:00
|
|
|
dstPartPath := filepath.Join(dstDir, filepath.Base(srcPartPath))
|
2023-04-14 07:48:05 +02:00
|
|
|
fs.MustHardLinkFiles(srcPartPath, dstPartPath)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2023-03-25 22:33:54 +01:00
|
|
|
// Copy the appliedRetentionFilename to dstDir.
|
2023-03-19 09:36:05 +01:00
|
|
|
// This file can be created by VictoriaMetrics enterprise.
|
|
|
|
// See https://docs.victoriametrics.com/#retention-filters .
|
|
|
|
// Do not make hard link to this file, since it can be modified over time.
|
2023-03-25 22:33:54 +01:00
|
|
|
srcPath := filepath.Join(srcDir, appliedRetentionFilename)
|
2023-03-19 09:36:05 +01:00
|
|
|
if fs.IsPathExist(srcPath) {
|
2023-03-25 22:33:54 +01:00
|
|
|
dstPath := filepath.Join(dstDir, filepath.Base(srcPath))
|
2023-04-14 08:02:55 +02:00
|
|
|
fs.MustCopyFile(srcPath, dstPath)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
fs.MustSyncPath(dstDir)
|
|
|
|
parentDir := filepath.Dir(dstDir)
|
|
|
|
fs.MustSyncPath(parentDir)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
// partNamesJSON is the JSON representation of part names,
// which is written by mustWritePartNames and read by mustReadPartNames.
type partNamesJSON struct {
	// Small contains names of small parts.
	Small []string

	// Big contains names of big parts.
	Big []string
}
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
func mustWritePartNames(pwsSmall, pwsBig []*partWrapper, dstDir string) {
|
|
|
|
partNamesSmall := getPartNames(pwsSmall)
|
|
|
|
partNamesBig := getPartNames(pwsBig)
|
|
|
|
partNames := &partNamesJSON{
|
|
|
|
Small: partNamesSmall,
|
|
|
|
Big: partNamesBig,
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2023-03-19 09:36:05 +01:00
|
|
|
data, err := json.Marshal(partNames)
|
|
|
|
if err != nil {
|
|
|
|
logger.Panicf("BUG: cannot marshal partNames to JSON: %s", err)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2024-04-16 19:07:36 +02:00
|
|
|
partsFile := filepath.Join(dstDir, partsFilename)
|
|
|
|
fs.MustWriteAtomic(partsFile, data, true)
|
2023-03-19 09:36:05 +01:00
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
func getPartNames(pws []*partWrapper) []string {
|
|
|
|
partNames := make([]string, 0, len(pws))
|
|
|
|
for _, pw := range pws {
|
|
|
|
if pw.mp != nil {
|
|
|
|
// Skip in-memory parts
|
|
|
|
continue
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2023-03-19 09:36:05 +01:00
|
|
|
partName := filepath.Base(pw.p.path)
|
|
|
|
partNames = append(partNames, partName)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2023-03-19 09:36:05 +01:00
|
|
|
sort.Strings(partNames)
|
|
|
|
return partNames
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2024-04-16 19:07:36 +02:00
|
|
|
func mustReadPartNames(partsFile, smallPartsPath, bigPartsPath string) ([]string, []string) {
|
|
|
|
if fs.IsPathExist(partsFile) {
|
|
|
|
data, err := os.ReadFile(partsFile)
|
2023-04-15 08:16:26 +02:00
|
|
|
if err != nil {
|
2024-04-16 19:07:36 +02:00
|
|
|
logger.Panicf("FATAL: cannot read %q: %s", partsFile, err)
|
2023-04-15 08:16:26 +02:00
|
|
|
}
|
2023-03-19 09:36:05 +01:00
|
|
|
var partNames partNamesJSON
|
|
|
|
if err := json.Unmarshal(data, &partNames); err != nil {
|
2024-04-16 19:07:36 +02:00
|
|
|
logger.Panicf("FATAL: cannot parse %q: %s", partsFile, err)
|
2023-03-19 09:36:05 +01:00
|
|
|
}
|
|
|
|
return partNames.Small, partNames.Big
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2024-04-16 19:07:36 +02:00
|
|
|
// The partsFile is missing. This is the upgrade from versions previous to v1.90.0.
|
2023-03-19 09:36:05 +01:00
|
|
|
// Read part names from smallPartsPath and bigPartsPath directories
|
|
|
|
partNamesSmall := mustReadPartNamesFromDir(smallPartsPath)
|
|
|
|
partNamesBig := mustReadPartNamesFromDir(bigPartsPath)
|
|
|
|
return partNamesSmall, partNamesBig
|
|
|
|
}
|
2019-05-22 23:16:55 +02:00
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
func mustReadPartNamesFromDir(srcDir string) []string {
|
2023-04-15 07:08:43 +02:00
|
|
|
if !fs.IsPathExist(srcDir) {
|
|
|
|
return nil
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2023-04-15 07:08:43 +02:00
|
|
|
des := fs.MustReadDir(srcDir)
|
2023-03-19 09:36:05 +01:00
|
|
|
var partNames []string
|
|
|
|
for _, de := range des {
|
|
|
|
if !fs.IsDirOrSymlink(de) {
|
|
|
|
// Skip non-directories.
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
partName := de.Name()
|
|
|
|
if isSpecialDir(partName) {
|
|
|
|
// Skip special dirs.
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
partNames = append(partNames, partName)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
2023-03-19 09:36:05 +01:00
|
|
|
return partNames
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|
|
|
|
|
2023-03-19 09:36:05 +01:00
|
|
|
func isSpecialDir(name string) bool {
|
2023-03-25 22:33:54 +01:00
|
|
|
return name == "tmp" || name == "txn" || name == snapshotsDirname || fs.IsScheduledForRemoval(name)
|
2019-05-22 23:16:55 +02:00
|
|
|
}
|