mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-15 16:30:55 +01:00
lib/storage: improve deduplication algorithm
Now it leaves only the first data point on each `-dedup.minScrapeInterval` interval. Previously it may leave two data points on the interval. This could lead to unexpected results for `histogram_quantile(phi, sum(rate(buckets)) by (le))` query.
This commit is contained in:
parent
d9bdda408c
commit
a0000c3a6e
@ -1,7 +1,6 @@
|
|||||||
package storage
|
package storage
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"math"
|
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -11,53 +10,39 @@ import (
|
|||||||
//
|
//
|
||||||
// This function must be called before initializing the storage.
|
// This function must be called before initializing the storage.
|
||||||
func SetMinScrapeIntervalForDeduplication(interval time.Duration) {
|
func SetMinScrapeIntervalForDeduplication(interval time.Duration) {
|
||||||
minScrapeInterval = interval
|
minScrapeInterval = interval.Milliseconds()
|
||||||
}
|
}
|
||||||
|
|
||||||
var minScrapeInterval = time.Duration(0)
|
var minScrapeInterval = int64(0)
|
||||||
|
|
||||||
func getMinDelta() int64 {
|
|
||||||
// Use 7/8 of minScrapeInterval in order to preserve proper data points.
|
|
||||||
// For instance, if minScrapeInterval=10, the following time series:
|
|
||||||
// 10 15 19 25 30 34 41
|
|
||||||
// Would be unexpectedly converted to if using 100% of minScrapeInterval:
|
|
||||||
// 10 25 41
|
|
||||||
// When using 7/8 of minScrapeInterval, it will be converted to the expected:
|
|
||||||
// 10 19 30 41
|
|
||||||
ms := minScrapeInterval.Milliseconds()
|
|
||||||
|
|
||||||
// Try calculating scrape interval via integer arithmetic.
|
|
||||||
d := (ms / 8) * 7
|
|
||||||
if d > 0 {
|
|
||||||
return d
|
|
||||||
}
|
|
||||||
// Too small scrape interval for integer arithmetic. Calculate d using floating-point arithmetic.
|
|
||||||
return int64(math.Round(float64(ms) / 8 * 7))
|
|
||||||
}
|
|
||||||
|
|
||||||
// DeduplicateSamples removes samples from src* if they are closer to each other than minScrapeInterval.
|
// DeduplicateSamples removes samples from src* if they are closer to each other than minScrapeInterval.
|
||||||
func DeduplicateSamples(srcTimestamps []int64, srcValues []float64) ([]int64, []float64) {
|
func DeduplicateSamples(srcTimestamps []int64, srcValues []float64) ([]int64, []float64) {
|
||||||
if minScrapeInterval <= 0 {
|
if minScrapeInterval <= 0 {
|
||||||
return srcTimestamps, srcValues
|
return srcTimestamps, srcValues
|
||||||
}
|
}
|
||||||
minDelta := getMinDelta()
|
if !needsDedup(srcTimestamps, minScrapeInterval) {
|
||||||
if !needsDedup(srcTimestamps, minDelta) {
|
|
||||||
// Fast path - nothing to deduplicate
|
// Fast path - nothing to deduplicate
|
||||||
return srcTimestamps, srcValues
|
return srcTimestamps, srcValues
|
||||||
}
|
}
|
||||||
|
|
||||||
// Slow path - dedup data points.
|
// Slow path - dedup data points.
|
||||||
prevTimestamp := srcTimestamps[0]
|
tsNext := (srcTimestamps[0] - srcTimestamps[0] % minScrapeInterval) + minScrapeInterval
|
||||||
dstTimestamps := srcTimestamps[:1]
|
dstTimestamps := srcTimestamps[:1]
|
||||||
dstValues := srcValues[:1]
|
dstValues := srcValues[:1]
|
||||||
for i := 1; i < len(srcTimestamps); i++ {
|
for i := 1; i < len(srcTimestamps); i++ {
|
||||||
ts := srcTimestamps[i]
|
ts := srcTimestamps[i]
|
||||||
if ts-prevTimestamp < minDelta {
|
if ts < tsNext {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
dstTimestamps = append(dstTimestamps, ts)
|
dstTimestamps = append(dstTimestamps, ts)
|
||||||
dstValues = append(dstValues, srcValues[i])
|
dstValues = append(dstValues, srcValues[i])
|
||||||
prevTimestamp = ts
|
|
||||||
|
// Update tsNext
|
||||||
|
tsNext += minScrapeInterval
|
||||||
|
if ts >= tsNext {
|
||||||
|
// Slow path for updating ts.
|
||||||
|
tsNext = (ts - ts % minScrapeInterval) + minScrapeInterval
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return dstTimestamps, dstValues
|
return dstTimestamps, dstValues
|
||||||
}
|
}
|
||||||
@ -66,29 +51,29 @@ func deduplicateSamplesDuringMerge(srcTimestamps, srcValues []int64) ([]int64, [
|
|||||||
if minScrapeInterval <= 0 {
|
if minScrapeInterval <= 0 {
|
||||||
return srcTimestamps, srcValues
|
return srcTimestamps, srcValues
|
||||||
}
|
}
|
||||||
if len(srcTimestamps) < 32 {
|
if !needsDedup(srcTimestamps, minScrapeInterval) {
|
||||||
// Do not de-duplicate small number of samples during merge
|
|
||||||
// in order to improve deduplication accuracy on later stages.
|
|
||||||
return srcTimestamps, srcValues
|
|
||||||
}
|
|
||||||
minDelta := getMinDelta()
|
|
||||||
if !needsDedup(srcTimestamps, minDelta) {
|
|
||||||
// Fast path - nothing to deduplicate
|
// Fast path - nothing to deduplicate
|
||||||
return srcTimestamps, srcValues
|
return srcTimestamps, srcValues
|
||||||
}
|
}
|
||||||
|
|
||||||
// Slow path - dedup data points.
|
// Slow path - dedup data points.
|
||||||
prevTimestamp := srcTimestamps[0]
|
tsNext := (srcTimestamps[0] - srcTimestamps[0] % minScrapeInterval) + minScrapeInterval
|
||||||
dstTimestamps := srcTimestamps[:1]
|
dstTimestamps := srcTimestamps[:1]
|
||||||
dstValues := srcValues[:1]
|
dstValues := srcValues[:1]
|
||||||
for i := 1; i < len(srcTimestamps); i++ {
|
for i := 1; i < len(srcTimestamps); i++ {
|
||||||
ts := srcTimestamps[i]
|
ts := srcTimestamps[i]
|
||||||
if ts-prevTimestamp < minDelta {
|
if ts < tsNext {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
dstTimestamps = append(dstTimestamps, ts)
|
dstTimestamps = append(dstTimestamps, ts)
|
||||||
dstValues = append(dstValues, srcValues[i])
|
dstValues = append(dstValues, srcValues[i])
|
||||||
prevTimestamp = ts
|
|
||||||
|
// Update tsNext
|
||||||
|
tsNext += minScrapeInterval
|
||||||
|
if ts >= tsNext {
|
||||||
|
// Slow path for updating ts.
|
||||||
|
tsNext = (ts - ts % minScrapeInterval) + minScrapeInterval
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return dstTimestamps, dstValues
|
return dstTimestamps, dstValues
|
||||||
}
|
}
|
||||||
|
@ -6,27 +6,6 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestGetMinDelta(t *testing.T) {
|
|
||||||
// Disable deduplication before exit, since the rest of tests expect disabled dedup.
|
|
||||||
defer SetMinScrapeIntervalForDeduplication(0)
|
|
||||||
|
|
||||||
f := func(scrapeInterval time.Duration, dExpected int64) {
|
|
||||||
t.Helper()
|
|
||||||
SetMinScrapeIntervalForDeduplication(scrapeInterval)
|
|
||||||
d := getMinDelta()
|
|
||||||
if d != dExpected {
|
|
||||||
t.Fatalf("unexpected getMinDelta(%s); got %d; want %d", scrapeInterval, d, dExpected)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
f(0, 0)
|
|
||||||
f(time.Millisecond, 1)
|
|
||||||
f(5*time.Millisecond, 4)
|
|
||||||
f(8*time.Millisecond, 7)
|
|
||||||
f(100*time.Millisecond, 84)
|
|
||||||
f(time.Second, 875)
|
|
||||||
f(10*time.Second, 8750)
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestDeduplicateSamples(t *testing.T) {
|
func TestDeduplicateSamples(t *testing.T) {
|
||||||
// Disable deduplication before exit, since the rest of tests expect disabled dedup.
|
// Disable deduplication before exit, since the rest of tests expect disabled dedup.
|
||||||
defer SetMinScrapeIntervalForDeduplication(0)
|
defer SetMinScrapeIntervalForDeduplication(0)
|
||||||
@ -73,8 +52,8 @@ func TestDeduplicateSamples(t *testing.T) {
|
|||||||
f(time.Millisecond, []int64{123, 456}, []int64{123, 456})
|
f(time.Millisecond, []int64{123, 456}, []int64{123, 456})
|
||||||
f(time.Millisecond, []int64{0, 0, 0, 1, 1, 2, 3, 3, 3, 4}, []int64{0, 1, 2, 3, 4})
|
f(time.Millisecond, []int64{0, 0, 0, 1, 1, 2, 3, 3, 3, 4}, []int64{0, 1, 2, 3, 4})
|
||||||
f(0, []int64{0, 0, 0, 1, 1, 2, 3, 3, 3, 4}, []int64{0, 0, 0, 1, 1, 2, 3, 3, 3, 4})
|
f(0, []int64{0, 0, 0, 1, 1, 2, 3, 3, 3, 4}, []int64{0, 0, 0, 1, 1, 2, 3, 3, 3, 4})
|
||||||
f(100*time.Millisecond, []int64{0, 100, 100, 101, 150, 180, 200, 300, 1000}, []int64{0, 100, 200, 300, 1000})
|
f(100*time.Millisecond, []int64{0, 100, 100, 101, 150, 180, 205, 300, 1000}, []int64{0, 100, 205, 300, 1000})
|
||||||
f(10*time.Second, []int64{10e3, 13e3, 21e3, 22e3, 30e3, 33e3, 39e3, 45e3}, []int64{10e3, 21e3, 30e3, 39e3})
|
f(10*time.Second, []int64{10e3, 13e3, 21e3, 22e3, 30e3, 33e3, 39e3, 45e3}, []int64{10e3, 21e3, 30e3, 45e3})
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestDeduplicateSamplesDuringMerge(t *testing.T) {
|
func TestDeduplicateSamplesDuringMerge(t *testing.T) {
|
||||||
@ -121,9 +100,9 @@ func TestDeduplicateSamplesDuringMerge(t *testing.T) {
|
|||||||
f(time.Millisecond, nil, []int64{})
|
f(time.Millisecond, nil, []int64{})
|
||||||
f(time.Millisecond, []int64{123}, []int64{123})
|
f(time.Millisecond, []int64{123}, []int64{123})
|
||||||
f(time.Millisecond, []int64{123, 456}, []int64{123, 456})
|
f(time.Millisecond, []int64{123, 456}, []int64{123, 456})
|
||||||
f(time.Millisecond, []int64{0, 0, 0, 1, 1, 2, 3, 3, 3, 4}, []int64{0, 0, 0, 1, 1, 2, 3, 3, 3, 4})
|
f(time.Millisecond, []int64{0, 0, 0, 1, 1, 2, 3, 3, 3, 4}, []int64{0, 1, 2, 3, 4})
|
||||||
f(100*time.Millisecond, []int64{0, 100, 100, 101, 150, 180, 200, 300, 1000}, []int64{0, 100, 100, 101, 150, 180, 200, 300, 1000})
|
f(100*time.Millisecond, []int64{0, 100, 100, 101, 150, 180, 200, 300, 1000}, []int64{0, 100, 200, 300, 1000})
|
||||||
f(10*time.Second, []int64{10e3, 13e3, 21e3, 22e3, 30e3, 33e3, 39e3, 45e3}, []int64{10e3, 13e3, 21e3, 22e3, 30e3, 33e3, 39e3, 45e3})
|
f(10*time.Second, []int64{10e3, 13e3, 21e3, 22e3, 30e3, 33e3, 39e3, 45e3}, []int64{10e3, 21e3, 30e3, 45e3})
|
||||||
|
|
||||||
var timestamps, timestampsExpected []int64
|
var timestamps, timestampsExpected []int64
|
||||||
for i := 0; i < 40; i++ {
|
for i := 0; i < 40; i++ {
|
||||||
|
36
lib/storage/dedup_timing_test.go
Normal file
36
lib/storage/dedup_timing_test.go
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
package storage
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func BenchmarkDeduplicateSamples(b *testing.B) {
|
||||||
|
const blockSize = 8192
|
||||||
|
timestamps := make([]int64, blockSize)
|
||||||
|
values := make([]float64, blockSize)
|
||||||
|
for i := 0; i < len(timestamps); i++ {
|
||||||
|
timestamps[i] = int64(i) * 1e3
|
||||||
|
}
|
||||||
|
for _, minScrapeInterval := range []time.Duration{time.Second, 2 * time.Second, 5 * time.Second, 10 * time.Second} {
|
||||||
|
b.Run(fmt.Sprintf("minScrapeInterval=%s", minScrapeInterval), func(b *testing.B) {
|
||||||
|
SetMinScrapeIntervalForDeduplication(minScrapeInterval)
|
||||||
|
defer SetMinScrapeIntervalForDeduplication(0)
|
||||||
|
b.ReportAllocs()
|
||||||
|
b.SetBytes(blockSize)
|
||||||
|
b.RunParallel(func(pb *testing.PB) {
|
||||||
|
timestampsCopy := make([]int64, 0, blockSize)
|
||||||
|
valuesCopy := make([]float64, 0, blockSize)
|
||||||
|
for pb.Next() {
|
||||||
|
timestampsCopy := append(timestampsCopy[:0], timestamps...)
|
||||||
|
valuesCopy := append(valuesCopy[:0], values...)
|
||||||
|
ts, vs := DeduplicateSamples(timestampsCopy, valuesCopy)
|
||||||
|
if len(ts) == 0 || len(vs) == 0 {
|
||||||
|
panic(fmt.Errorf("expecting non-empty results; got\nts=%v\nvs=%v", ts, vs))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user