VictoriaMetrics/lib/streamaggr/streamaggr_timing_test.go

package streamaggr

import (
	"fmt"
	"strings"
	"testing"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)

func BenchmarkAggregatorsPushByJobAvg(b *testing.B) {
	for _, output := range []string{
		"total",
		"total_prometheus",
		"increase",
		"increase_prometheus",
		"count_series",
		"count_samples",
		"unique_samples",
		"sum_samples",
		"last",
		"min",
		"max",
		"avg",
		"stddev",
		"stdvar",
		"histogram_bucket",
		"quantiles(0, 0.5, 1)",
	} {
		b.Run(fmt.Sprintf("output=%s", output), func(b *testing.B) {
			benchmarkAggregatorsPush(b, output)
		})
	}
}

func benchmarkAggregatorsPush(b *testing.B, output string) {
	config := fmt.Sprintf(`
- match: http_requests_total
  interval: 24h
  without: [job]
  outputs: [%q]
`, output)
	pushFunc := func(tss []prompbmarshal.TimeSeries) {}
	a, err := newAggregatorsFromData([]byte(config), pushFunc, 0)
	if err != nil {
		b.Fatalf("unexpected error when initializing aggregators: %s", err)
	}
	defer a.MustStop()

	const loops = 10

	b.ReportAllocs()
	b.SetBytes(int64(len(benchSeries) * loops))
	b.RunParallel(func(pb *testing.PB) {
		var matchIdxs []byte
		for pb.Next() {
			for i := 0; i < loops; i++ {
				series := benchSeries
				for len(series) > 0 {
					chunk := series
					if len(chunk) > 1_000 {
						chunk = series[:1_000]
						series = series[len(chunk):]
					} else {
						series = nil
					}
					matchIdxs = a.Push(chunk, matchIdxs)
				}
			}
		}
	})
}

func newBenchSeries(seriesCount int) []prompbmarshal.TimeSeries {
	a := make([]string, seriesCount)
	for j := 0; j < seriesCount; j++ {
		s := fmt.Sprintf(`http_requests_total{path="/foo/%d",job="foo",instance="bar",pod="pod-123232312",namespace="kube-foo-bar",node="node-123-3434-443",`+
			`some_other_label="foo-bar-baz",environment="prod",label1="value1",label2="value2",label3="value3"} %d`, j, j*1000)
		a = append(a, s)
	}
	metrics := strings.Join(a, "\n")
	return mustParsePromMetrics(metrics)
}

const seriesCount = 10_000

var benchSeries = newBenchSeries(seriesCount)
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 07:19:18 +01:00			`package streamaggr`

			`import (`
			`"fmt"`
			`"strings"`
			`"testing"`

			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"`
			`)`

			`func BenchmarkAggregatorsPushByJobAvg(b *testing.B) {`
			`for _, output := range []string{`
			`"total",`
lib/streamaggr: huge pile of changes - Reduce memory usage by up to 5x when de-duplicating samples across big number of time series. - Reduce memory usage by up to 5x when aggregating across big number of output time series. - Add lib/promutils.LabelsCompressor, which is going to be used by other VictoriaMetrics components for reducing memory usage for marshaled []prompbmarshal.Label. - Add `dedup_interval` option at aggregation config, which allows setting individual deduplication intervals per each aggregation. - Add `keep_metric_names` option at aggregation config, which allows keeping the original metric names in the output samples. - Add `unique_samples` output, which counts the number of unique sample values. - Add `increase_prometheus` and `total_prometheus` outputs, which ignore the first sample per each newly encountered time series. - Use 64-bit hashes instead of marshaled labels as map keys when calculating `count_series` output. This makes obsolete https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5579 - Expose various metrics, which may help debugging stream aggregation: - vm_streamaggr_dedup_state_size_bytes - the size of data structures responsible for deduplication - vm_streamaggr_dedup_state_items_count - the number of items in the deduplication data structures - vm_streamaggr_labels_compressor_size_bytes - the size of labels compressor data structures - vm_streamaggr_labels_compressor_items_count - the number of entries in the labels compressor - vm_streamaggr_flush_duration_seconds - a histogram, which shows the duration of stream aggregation flushes - vm_streamaggr_dedup_flush_duration_seconds - a histogram, which shows the duration of deduplication flushes - vm_streamaggr_flush_timeouts_total - counter for timed out stream aggregation flushes, which took longer than the configured interval - vm_streamaggr_dedup_flush_timeouts_total - counter for timed out deduplication flushes, which took longer than the configured dedup_interval - Actualize docs/stream-aggregation.md The memory usage reduction increases CPU usage during stream aggregation by up to 30%. This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5850 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5898 2024-03-02 01:42:26 +01:00			`"total_prometheus",`
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 07:19:18 +01:00			`"increase",`
lib/streamaggr: huge pile of changes - Reduce memory usage by up to 5x when de-duplicating samples across big number of time series. - Reduce memory usage by up to 5x when aggregating across big number of output time series. - Add lib/promutils.LabelsCompressor, which is going to be used by other VictoriaMetrics components for reducing memory usage for marshaled []prompbmarshal.Label. - Add `dedup_interval` option at aggregation config, which allows setting individual deduplication intervals per each aggregation. - Add `keep_metric_names` option at aggregation config, which allows keeping the original metric names in the output samples. - Add `unique_samples` output, which counts the number of unique sample values. - Add `increase_prometheus` and `total_prometheus` outputs, which ignore the first sample per each newly encountered time series. - Use 64-bit hashes instead of marshaled labels as map keys when calculating `count_series` output. This makes obsolete https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5579 - Expose various metrics, which may help debugging stream aggregation: - vm_streamaggr_dedup_state_size_bytes - the size of data structures responsible for deduplication - vm_streamaggr_dedup_state_items_count - the number of items in the deduplication data structures - vm_streamaggr_labels_compressor_size_bytes - the size of labels compressor data structures - vm_streamaggr_labels_compressor_items_count - the number of entries in the labels compressor - vm_streamaggr_flush_duration_seconds - a histogram, which shows the duration of stream aggregation flushes - vm_streamaggr_dedup_flush_duration_seconds - a histogram, which shows the duration of deduplication flushes - vm_streamaggr_flush_timeouts_total - counter for timed out stream aggregation flushes, which took longer than the configured interval - vm_streamaggr_dedup_flush_timeouts_total - counter for timed out deduplication flushes, which took longer than the configured dedup_interval - Actualize docs/stream-aggregation.md The memory usage reduction increases CPU usage during stream aggregation by up to 30%. This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5850 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5898 2024-03-02 01:42:26 +01:00			`"increase_prometheus",`
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 07:19:18 +01:00			`"count_series",`
			`"count_samples",`
lib/streamaggr: huge pile of changes - Reduce memory usage by up to 5x when de-duplicating samples across big number of time series. - Reduce memory usage by up to 5x when aggregating across big number of output time series. - Add lib/promutils.LabelsCompressor, which is going to be used by other VictoriaMetrics components for reducing memory usage for marshaled []prompbmarshal.Label. - Add `dedup_interval` option at aggregation config, which allows setting individual deduplication intervals per each aggregation. - Add `keep_metric_names` option at aggregation config, which allows keeping the original metric names in the output samples. - Add `unique_samples` output, which counts the number of unique sample values. - Add `increase_prometheus` and `total_prometheus` outputs, which ignore the first sample per each newly encountered time series. - Use 64-bit hashes instead of marshaled labels as map keys when calculating `count_series` output. This makes obsolete https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5579 - Expose various metrics, which may help debugging stream aggregation: - vm_streamaggr_dedup_state_size_bytes - the size of data structures responsible for deduplication - vm_streamaggr_dedup_state_items_count - the number of items in the deduplication data structures - vm_streamaggr_labels_compressor_size_bytes - the size of labels compressor data structures - vm_streamaggr_labels_compressor_items_count - the number of entries in the labels compressor - vm_streamaggr_flush_duration_seconds - a histogram, which shows the duration of stream aggregation flushes - vm_streamaggr_dedup_flush_duration_seconds - a histogram, which shows the duration of deduplication flushes - vm_streamaggr_flush_timeouts_total - counter for timed out stream aggregation flushes, which took longer than the configured interval - vm_streamaggr_dedup_flush_timeouts_total - counter for timed out deduplication flushes, which took longer than the configured dedup_interval - Actualize docs/stream-aggregation.md The memory usage reduction increases CPU usage during stream aggregation by up to 30%. This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5850 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5898 2024-03-02 01:42:26 +01:00			`"unique_samples",`
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 07:19:18 +01:00			`"sum_samples",`
			`"last",`
			`"min",`
			`"max",`
			`"avg",`
			`"stddev",`
			`"stdvar",`
			`"histogram_bucket",`
			`"quantiles(0, 0.5, 1)",`
			`} {`
			`b.Run(fmt.Sprintf("output=%s", output), func(b *testing.B) {`
			`benchmarkAggregatorsPush(b, output)`
			`})`
			`}`
			`}`

			`func benchmarkAggregatorsPush(b *testing.B, output string) {`
			config := fmt.Sprintf(`
			`- match: http_requests_total`
			`interval: 24h`
			`without: [job]`
			`outputs: [%q]`
			`, output)
lib/streamaggr: make the BenchmarkAggregatorsPushByJobAvg closer to production case with long list of labels per sample 2024-02-29 01:39:00 +01:00			`pushFunc := func(tss []prompbmarshal.TimeSeries) {}`
lib/streamaggr: expand `%{ENV}` placeholders in stream aggregation configs 2024-01-24 11:31:27 +01:00			`a, err := newAggregatorsFromData([]byte(config), pushFunc, 0)`
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 07:19:18 +01:00			`if err != nil {`
			`b.Fatalf("unexpected error when initializing aggregators: %s", err)`
			`}`
			`defer a.MustStop()`

lib/streamaggr: huge pile of changes - Reduce memory usage by up to 5x when de-duplicating samples across big number of time series. - Reduce memory usage by up to 5x when aggregating across big number of output time series. - Add lib/promutils.LabelsCompressor, which is going to be used by other VictoriaMetrics components for reducing memory usage for marshaled []prompbmarshal.Label. - Add `dedup_interval` option at aggregation config, which allows setting individual deduplication intervals per each aggregation. - Add `keep_metric_names` option at aggregation config, which allows keeping the original metric names in the output samples. - Add `unique_samples` output, which counts the number of unique sample values. - Add `increase_prometheus` and `total_prometheus` outputs, which ignore the first sample per each newly encountered time series. - Use 64-bit hashes instead of marshaled labels as map keys when calculating `count_series` output. This makes obsolete https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5579 - Expose various metrics, which may help debugging stream aggregation: - vm_streamaggr_dedup_state_size_bytes - the size of data structures responsible for deduplication - vm_streamaggr_dedup_state_items_count - the number of items in the deduplication data structures - vm_streamaggr_labels_compressor_size_bytes - the size of labels compressor data structures - vm_streamaggr_labels_compressor_items_count - the number of entries in the labels compressor - vm_streamaggr_flush_duration_seconds - a histogram, which shows the duration of stream aggregation flushes - vm_streamaggr_dedup_flush_duration_seconds - a histogram, which shows the duration of deduplication flushes - vm_streamaggr_flush_timeouts_total - counter for timed out stream aggregation flushes, which took longer than the configured interval - vm_streamaggr_dedup_flush_timeouts_total - counter for timed out deduplication flushes, which took longer than the configured dedup_interval - Actualize docs/stream-aggregation.md The memory usage reduction increases CPU usage during stream aggregation by up to 30%. This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5850 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5898 2024-03-02 01:42:26 +01:00			`const loops = 10`
lib/streamaggr: make the BenchmarkAggregatorsPushByJobAvg closer to production case with long list of labels per sample 2024-02-29 01:39:00 +01:00
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 07:19:18 +01:00			`b.ReportAllocs()`
lib/streamaggr: make the BenchmarkAggregatorsPushByJobAvg closer to production case with long list of labels per sample 2024-02-29 01:39:00 +01:00			`b.SetBytes(int64(len(benchSeries) * loops))`
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 07:19:18 +01:00			`b.RunParallel(func(pb *testing.PB) {`
lib/streamaggr: follow-up for 736197179e0c097ccf22ade06ee0c873d2e6997b - Use a byte slice instead of a map for tracking indexes for matching series. This improves performance, since access by slice index is faster than access by map key. - Re-use the byte slice for tracking indexes for matching series. This removes unnecessary memory allocations and improves stream aggregation performance a bit. - Add an ability to return to the previous behvaiour by specifying -remoteWrite.streamAggr.dropInput command-line flag. In this case all the input samples are dropped when stream aggregation is enabled. - Backport the new stream aggregation behaviour from vmagent to single-node VictoriaMetrics when -streamAggr.config option is set. - Improve docs regarding this change at docs/CHANGELOG.md - Document the new behavior at docs/stream-aggregation.md Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4243 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4575 2023-07-25 01:44:09 +02:00			`var matchIdxs []byte`
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 07:19:18 +01:00			`for pb.Next() {`
lib/streamaggr: make the BenchmarkAggregatorsPushByJobAvg closer to production case with long list of labels per sample 2024-02-29 01:39:00 +01:00			`for i := 0; i < loops; i++ {`
lib/streamaggr: huge pile of changes - Reduce memory usage by up to 5x when de-duplicating samples across big number of time series. - Reduce memory usage by up to 5x when aggregating across big number of output time series. - Add lib/promutils.LabelsCompressor, which is going to be used by other VictoriaMetrics components for reducing memory usage for marshaled []prompbmarshal.Label. - Add `dedup_interval` option at aggregation config, which allows setting individual deduplication intervals per each aggregation. - Add `keep_metric_names` option at aggregation config, which allows keeping the original metric names in the output samples. - Add `unique_samples` output, which counts the number of unique sample values. - Add `increase_prometheus` and `total_prometheus` outputs, which ignore the first sample per each newly encountered time series. - Use 64-bit hashes instead of marshaled labels as map keys when calculating `count_series` output. This makes obsolete https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5579 - Expose various metrics, which may help debugging stream aggregation: - vm_streamaggr_dedup_state_size_bytes - the size of data structures responsible for deduplication - vm_streamaggr_dedup_state_items_count - the number of items in the deduplication data structures - vm_streamaggr_labels_compressor_size_bytes - the size of labels compressor data structures - vm_streamaggr_labels_compressor_items_count - the number of entries in the labels compressor - vm_streamaggr_flush_duration_seconds - a histogram, which shows the duration of stream aggregation flushes - vm_streamaggr_dedup_flush_duration_seconds - a histogram, which shows the duration of deduplication flushes - vm_streamaggr_flush_timeouts_total - counter for timed out stream aggregation flushes, which took longer than the configured interval - vm_streamaggr_dedup_flush_timeouts_total - counter for timed out deduplication flushes, which took longer than the configured dedup_interval - Actualize docs/stream-aggregation.md The memory usage reduction increases CPU usage during stream aggregation by up to 30%. This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5850 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5898 2024-03-02 01:42:26 +01:00			`series := benchSeries`
			`for len(series) > 0 {`
			`chunk := series`
			`if len(chunk) > 1_000 {`
			`chunk = series[:1_000]`
			`series = series[len(chunk):]`
			`} else {`
			`series = nil`
			`}`
			`matchIdxs = a.Push(chunk, matchIdxs)`
			`}`
lib/streamaggr: make the BenchmarkAggregatorsPushByJobAvg closer to production case with long list of labels per sample 2024-02-29 01:39:00 +01:00			`}`
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 07:19:18 +01:00			`}`
			`})`
			`}`

lib/streamaggr: make the BenchmarkAggregatorsPushByJobAvg closer to production case with long list of labels per sample 2024-02-29 01:39:00 +01:00			`func newBenchSeries(seriesCount int) []prompbmarshal.TimeSeries {`
			`a := make([]string, seriesCount)`
			`for j := 0; j < seriesCount; j++ {`
			s := fmt.Sprintf(`http_requests_total{path="/foo/%d",job="foo",instance="bar",pod="pod-123232312",namespace="kube-foo-bar",node="node-123-3434-443",`+
			`some_other_label="foo-bar-baz",environment="prod",label1="value1",label2="value2",label3="value3"} %d`, j, j*1000)
			`a = append(a, s)`
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 07:19:18 +01:00			`}`
			`metrics := strings.Join(a, "\n")`
			`return mustParsePromMetrics(metrics)`
			`}`

lib/streamaggr: make the BenchmarkAggregatorsPushByJobAvg closer to production case with long list of labels per sample 2024-02-29 01:39:00 +01:00			`const seriesCount = 10_000`
app/{vmagent,vminsert}: add support for streaming aggregation See https://docs.victoriametrics.com/stream-aggregation.html Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3460 2023-01-04 07:19:18 +01:00
lib/streamaggr: make the BenchmarkAggregatorsPushByJobAvg closer to production case with long list of labels per sample 2024-02-29 01:39:00 +01:00			`var benchSeries = newBenchSeries(seriesCount)`