From c3d86eef96e33b2beb08807904970ac7cd4a636f Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin
Date: Fri, 31 Jan 2020 01:09:44 +0200
Subject: [PATCH] all: add `-dedup.minScrapeInterval` command-line flag for
 data de-duplication

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/86
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/278
---
 README.md                             | 14 ++++
 app/vmselect/netstorage/netstorage.go |  4 +-
 lib/storage/block.go                  | 19 +++++
 lib/storage/block_stream_writer.go    |  3 +-
 lib/storage/dedup.go                  | 99 +++++++++++++++++++++++++++
 5 files changed, 137 insertions(+), 2 deletions(-)
 create mode 100644 lib/storage/dedup.go

diff --git a/README.md b/README.md
index 14327e474..3ce398561 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,7 @@ Cluster version is available [here](https://github.com/VictoriaMetrics/VictoriaM
 - [Federation](#federation)
 - [Capacity planning](#capacity-planning)
 - [High availability](#high-availability)
+- [Deduplication](#deduplication)
 - [Retention](#retention)
 - [Multiple retentions](#multiple-retentions)
 - [Downsampling](#downsampling)
@@ -701,6 +702,19 @@ kill -HUP `pidof prometheus`
 If you have Prometheus HA pairs with replicas `r1` and `r2` in each pair, then configure each `r1`
 to write data to `victoriametrics-addr-1`, while each `r2` should write data to `victoriametrics-addr-2`.
 
+Another option is to write data simultaneously from a Prometheus HA pair to a pair of VictoriaMetrics instances
+with de-duplication enabled. See [this section](#deduplication) for details.
+
+
+### Deduplication
+
+VictoriaMetrics de-duplicates data points if the `-dedup.minScrapeInterval` command-line flag
+is set to a positive duration. For example, `-dedup.minScrapeInterval=60s` de-duplicates data points
+on the same time series if they are located closer than 60s to each other.
+De-duplication reduces disk space usage when multiple identically configured Prometheus instances in an HA pair
+write data to the same VictoriaMetrics instance. Note that these Prometheus instances must have identical
+`external_labels` sections in their configs, so that they write data to the same time series.
+
 
 ### Retention
 
diff --git a/app/vmselect/netstorage/netstorage.go b/app/vmselect/netstorage/netstorage.go
index fbb4e8c7d..77e057875 100644
--- a/app/vmselect/netstorage/netstorage.go
+++ b/app/vmselect/netstorage/netstorage.go
@@ -266,7 +266,7 @@ func mergeSortBlocks(dst *Result, sbh sortBlocksHeap) {
 			dst.Timestamps = append(dst.Timestamps, top.Timestamps[top.NextIdx:]...)
 			dst.Values = append(dst.Values, top.Values[top.NextIdx:]...)
 			putSortBlock(top)
-			return
+			break
 		}
 		sbNext := sbh[0]
 		tsNext := sbNext.Timestamps[sbNext.NextIdx]
@@ -287,6 +287,8 @@ func mergeSortBlocks(dst *Result, sbh sortBlocksHeap) {
 			putSortBlock(top)
 		}
 	}
+
+	dst.Timestamps, dst.Values = storage.DeduplicateSamples(dst.Timestamps, dst.Values)
 }
 
 type sortBlock struct {
diff --git a/lib/storage/block.go b/lib/storage/block.go
index 252a10ddf..8cbf090cb 100644
--- a/lib/storage/block.go
+++ b/lib/storage/block.go
@@ -158,6 +158,25 @@ func (b *Block) tooBig() bool {
 	return false
 }
 
+func (b *Block) deduplicateSamplesDuringMerge() {
+	if len(b.values) == 0 {
+		// Nothing to dedup or the data is already marshaled.
+		return
+	}
+	srcTimestamps := b.timestamps[b.nextIdx:]
+	srcValues := b.values[b.nextIdx:]
+	timestamps, values := deduplicateSamplesDuringMerge(srcTimestamps, srcValues)
+	b.timestamps = b.timestamps[:b.nextIdx+len(timestamps)]
+	b.values = b.values[:b.nextIdx+len(values)]
+}
+
+func (b *Block) rowsCount() int {
+	if len(b.values) == 0 {
+		return int(b.bh.RowsCount)
+	}
+	return len(b.values[b.nextIdx:])
+}
+
 // MarshalData marshals the block into binary representation.
 func (b *Block) MarshalData(timestampsBlockOffset, valuesBlockOffset uint64) ([]byte, []byte, []byte) {
 	if len(b.values) == 0 {
diff --git a/lib/storage/block_stream_writer.go b/lib/storage/block_stream_writer.go
index ee9292251..ab68e27c8 100644
--- a/lib/storage/block_stream_writer.go
+++ b/lib/storage/block_stream_writer.go
@@ -171,6 +171,8 @@ func (bsw *blockStreamWriter) MustClose() {
 
 // WriteExternalBlock writes b to bsw and updates ph and rowsMerged.
 func (bsw *blockStreamWriter) WriteExternalBlock(b *Block, ph *partHeader, rowsMerged *uint64) {
+	atomic.AddUint64(rowsMerged, uint64(b.rowsCount()))
+	b.deduplicateSamplesDuringMerge()
 	headerData, timestampsData, valuesData := b.MarshalData(bsw.timestampsBlockOffset, bsw.valuesBlockOffset)
 	bsw.indexData = append(bsw.indexData, headerData...)
@@ -186,7 +188,6 @@ func (bsw *blockStreamWriter) WriteExternalBlock(b *Block, ph *partHeader, rowsM
 	bsw.valuesBlockOffset += uint64(len(valuesData))
 
 	updatePartHeader(b, ph)
-	atomic.AddUint64(rowsMerged, uint64(b.bh.RowsCount))
 }
 
 func updatePartHeader(b *Block, ph *partHeader) {
diff --git a/lib/storage/dedup.go b/lib/storage/dedup.go
new file mode 100644
index 000000000..04e9929a6
--- /dev/null
+++ b/lib/storage/dedup.go
@@ -0,0 +1,99 @@
+package storage
+
+import (
+	"flag"
+
+	"github.com/VictoriaMetrics/metrics"
+)
+
+var minScrapeInterval = flag.Duration("dedup.minScrapeInterval", 0, "Remove superfluous samples from time series if they are located closer to each other than this duration. "+
+	"This may be useful for reducing overhead when multiple identically configured Prometheus instances write data to the same VictoriaMetrics. "+
+	"Deduplication is disabled if `-dedup.minScrapeInterval` is 0")
+
+func getMinDelta() int64 {
+	// Divide minScrapeInterval by 2 in order to preserve proper data points.
+	// For instance, if minScrapeInterval=10, the following time series:
+	//     10 15 19 25 30 34 41
+	// would be unexpectedly converted to:
+	//     10 25 41
+	// When dividing minScrapeInterval by 2, more data points are preserved:
+	//     10 15 25 30 41
+	return minScrapeInterval.Milliseconds() / 2
+}
+
+// DeduplicateSamples removes samples from src* if they are closer to each other than minScrapeInterval.
+func DeduplicateSamples(srcTimestamps []int64, srcValues []float64) ([]int64, []float64) {
+	if *minScrapeInterval <= 0 {
+		return srcTimestamps, srcValues
+	}
+	minDelta := getMinDelta()
+	if !needsDedup(srcTimestamps, minDelta) {
+		// Fast path - nothing to deduplicate
+		return srcTimestamps, srcValues
+	}
+
+	// Slow path - dedup data points.
+	prevTimestamp := srcTimestamps[0]
+	dstTimestamps := srcTimestamps[:1]
+	dstValues := srcValues[:1]
+	dedups := 0
+	for i := 1; i < len(srcTimestamps); i++ {
+		ts := srcTimestamps[i]
+		if ts-prevTimestamp < minDelta {
+			dedups++
+			continue
+		}
+		dstTimestamps = append(dstTimestamps, ts)
+		dstValues = append(dstValues, srcValues[i])
+		prevTimestamp = ts
+	}
+	dedupsDuringSelect.Add(dedups)
+	return dstTimestamps, dstValues
+}
+
+var dedupsDuringSelect = metrics.NewCounter(`deduplicated_samples_total{type="select"}`)
+
+func deduplicateSamplesDuringMerge(srcTimestamps []int64, srcValues []int64) ([]int64, []int64) {
+	if *minScrapeInterval <= 0 {
+		return srcTimestamps, srcValues
+	}
+	minDelta := getMinDelta()
+	if !needsDedup(srcTimestamps, minDelta) {
+		// Fast path - nothing to deduplicate
+		return srcTimestamps, srcValues
+	}
+
+	// Slow path - dedup data points.
+	prevTimestamp := srcTimestamps[0]
+	dstTimestamps := srcTimestamps[:1]
+	dstValues := srcValues[:1]
+	dedups := 0
+	for i := 1; i < len(srcTimestamps); i++ {
+		ts := srcTimestamps[i]
+		if ts-prevTimestamp < minDelta {
+			dedups++
+			continue
+		}
+		dstTimestamps = append(dstTimestamps, ts)
+		dstValues = append(dstValues, srcValues[i])
+		prevTimestamp = ts
+	}
+	dedupsDuringMerge.Add(dedups)
+	return dstTimestamps, dstValues
+}
+
+var dedupsDuringMerge = metrics.NewCounter(`deduplicated_samples_total{type="merge"}`)
+
+func needsDedup(timestamps []int64, minDelta int64) bool {
+	if len(timestamps) == 0 {
+		return false
+	}
+	prevTimestamp := timestamps[0]
+	for _, ts := range timestamps[1:] {
+		if ts-prevTimestamp < minDelta {
+			return true
+		}
+		prevTimestamp = ts
+	}
+	return false
+}
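
For illustration, here is a minimal standalone sketch that mirrors the dedup loop from `lib/storage/dedup.go` and shows why `getMinDelta` halves `-dedup.minScrapeInterval`: with the full interval as the threshold, roughly every other sample of a steadily scraped series is dropped. The `dedup` helper below is written for this example only and is not part of the patch.

```go
package main

import "fmt"

// dedup mirrors the slow path in dedup.go: a sample is kept only if it is
// at least minDelta milliseconds after the previously kept sample.
func dedup(timestamps []int64, minDelta int64) []int64 {
	if len(timestamps) == 0 {
		return timestamps
	}
	dst := timestamps[:1]
	prev := timestamps[0]
	for _, ts := range timestamps[1:] {
		if ts-prev < minDelta {
			continue // too close to the previously kept sample - drop it
		}
		dst = append(dst, ts)
		prev = ts
	}
	return dst
}

func main() {
	ts := []int64{10, 15, 19, 25, 30, 34, 41}
	// Full minScrapeInterval as the threshold drops too much:
	fmt.Println(dedup(append([]int64(nil), ts...), 10)) // [10 25 41]
	// Halved threshold (what getMinDelta returns) keeps more points:
	fmt.Println(dedup(append([]int64(nil), ts...), 5)) // [10 15 25 30 41]
}
```

The two printed results match the before/after series in the `getMinDelta` comment: halving the interval tolerates the natural jitter of scrape timestamps instead of discarding legitimate samples.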
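The query-time path is exposed as the exported `storage.DeduplicateSamples`. The following is a usage sketch only, assuming the patch is applied and `lib/storage` is importable from the VictoriaMetrics module; setting the flag via `flag.Set` stands in for the `-dedup.minScrapeInterval` command-line argument that the server would normally receive.

```go
package main

import (
	"flag"
	"fmt"
	"log"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
)

func main() {
	// Equivalent to starting the process with -dedup.minScrapeInterval=10ms;
	// the flag is registered by the storage package at init time.
	if err := flag.Set("dedup.minScrapeInterval", "10ms"); err != nil {
		log.Fatalf("cannot set flag: %s", err)
	}
	timestamps := []int64{10, 15, 19, 25, 30, 34, 41} // millisecond timestamps
	values := []float64{1, 2, 3, 4, 5, 6, 7}
	timestamps, values = storage.DeduplicateSamples(timestamps, values)
	fmt.Println(timestamps, values) // [10 15 25 30 41] [1 2 4 5 7]
}
```

Note that values are dropped together with their timestamps, so the two slices stay aligned; the merge-time variant `deduplicateSamplesDuringMerge` applies the same loop to the `int64` values stored in blocks.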