VictoriaMetrics/lib/promutils/labelscompressor_test.go

package promutils

import (
	"fmt"
	"sync"
	"testing"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
)

// TestLabelsCompressorSerial checks that labels survive a Compress/Decompress
// round trip when a single LabelsCompressor is used from one goroutine.
func TestLabelsCompressorSerial(t *testing.T) {
	var lc LabelsCompressor

	// roundTrip compresses the given labels, decompresses the result
	// and fails the test if the string form differs from the input.
	roundTrip := func(labels []prompbmarshal.Label) {
		t.Helper()

		want := labelsToString(labels)
		compressed := lc.Compress(nil, labels)
		got := labelsToString(lc.Decompress(nil, compressed))
		if got != want {
			t.Fatalf("unexpected result; got %s; want %s", got, want)
		}

		// The compressor stats are verified only for non-empty input.
		if len(labels) == 0 {
			return
		}
		if lc.SizeBytes() == 0 {
			t.Fatalf("Unexpected zero SizeBytes()")
		}
		if lc.ItemsCount() == 0 {
			t.Fatalf("Unexpected zero ItemsCount()")
		}
	}

	// Both the nil slice and the empty non-nil slice must round-trip.
	roundTrip(nil)
	roundTrip([]prompbmarshal.Label{})

	// Two labels.
	roundTrip([]prompbmarshal.Label{
		{Name: "instance", Value: "12345.4342.342.3"},
		{Name: "job", Value: "kube-pod-12323"},
	})

	// The same two labels plus an additional one.
	roundTrip([]prompbmarshal.Label{
		{Name: "instance", Value: "12345.4342.342.3"},
		{Name: "job", Value: "kube-pod-12323"},
		{Name: "pod", Value: "foo-bar-baz"},
	})
}

// TestLabelsCompressorConcurrent runs Compress/Decompress round trips from
// several goroutines sharing a single LabelsCompressor.
//
// Every goroutine builds the same series via newTestSeries, so the compressed
// form of series n must be identical regardless of which goroutine produced it
// first, and decompressing it must return the original labels.
func TestLabelsCompressorConcurrent(t *testing.T) {
	const concurrency = 5
	var lc LabelsCompressor
	// expectCompressedKeys maps a series index to the first compressed form
	// stored for it by any goroutine.
	var expectCompressedKeys sync.Map

	var wg sync.WaitGroup
	for i := 0; i < concurrency; i++ {
		wg.Add(1)
		// Pass i explicitly, so every goroutine reports its own iteration number
		// independently of the loop variable semantics of the Go version in use.
		go func(i int) {
			defer wg.Done()
			series := newTestSeries(100, 20)
			for n, labels := range series {
				sExpected := labelsToString(labels)
				data := lc.Compress(nil, labels)
				if expectData, ok := expectCompressedKeys.LoadOrStore(n, data); ok {
					if string(data) != string(expectData.([]byte)) {
						// Report via t.Errorf instead of panic: a panic in a worker goroutine
						// aborts the whole test binary, while t.Fatalf must be called only
						// from the goroutine running the test function.
						t.Errorf("unexpected compress result at series/%d in iteration %d", n, i)
						return
					}
				}
				labelsResult := lc.Decompress(nil, data)
				sResult := labelsToString(labelsResult)
				if sExpected != sResult {
					t.Errorf("unexpected result on iteration %d; got %s; want %s", i, sResult, sExpected)
					return
				}
			}
		}(i)
	}
	wg.Wait()

	if n := lc.SizeBytes(); n == 0 {
		t.Fatalf("Unexpected zero SizeBytes()")
	}
	if n := lc.ItemsCount(); n == 0 {
		t.Fatalf("Unexpected zero ItemsCount()")
	}
}

// labelsToString wraps labels into a Labels value and returns
// the result of its String method.
func labelsToString(labels []prompbmarshal.Label) string {
	var wrapped Labels
	wrapped.Labels = labels
	return wrapped.String()
}

// newTestSeries returns seriesCount series with labelsPerSeries labels each.
//
// The label at position j of series i is named "label_<j>" and has the value
// "value_<i>_<j>", so repeated calls with the same arguments return equal data.
func newTestSeries(seriesCount, labelsPerSeries int) [][]prompbmarshal.Label {
	result := make([][]prompbmarshal.Label, seriesCount)
	for seriesIdx := range result {
		row := make([]prompbmarshal.Label, labelsPerSeries)
		for labelIdx := range row {
			row[labelIdx].Name = fmt.Sprintf("label_%d", labelIdx)
			row[labelIdx].Value = fmt.Sprintf("value_%d_%d", seriesIdx, labelIdx)
		}
		result[seriesIdx] = row
	}
	return result
}
lib/streamaggr: huge pile of changes - Reduce memory usage by up to 5x when de-duplicating samples across big number of time series. - Reduce memory usage by up to 5x when aggregating across big number of output time series. - Add lib/promutils.LabelsCompressor, which is going to be used by other VictoriaMetrics components for reducing memory usage for marshaled []prompbmarshal.Label. - Add `dedup_interval` option at aggregation config, which allows setting individual deduplication intervals per each aggregation. - Add `keep_metric_names` option at aggregation config, which allows keeping the original metric names in the output samples. - Add `unique_samples` output, which counts the number of unique sample values. - Add `increase_prometheus` and `total_prometheus` outputs, which ignore the first sample per each newly encountered time series. - Use 64-bit hashes instead of marshaled labels as map keys when calculating `count_series` output. This makes obsolete https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5579 - Expose various metrics, which may help debugging stream aggregation: - vm_streamaggr_dedup_state_size_bytes - the size of data structures responsible for deduplication - vm_streamaggr_dedup_state_items_count - the number of items in the deduplication data structures - vm_streamaggr_labels_compressor_size_bytes - the size of labels compressor data structures - vm_streamaggr_labels_compressor_items_count - the number of entries in the labels compressor - vm_streamaggr_flush_duration_seconds - a histogram, which shows the duration of stream aggregation flushes - vm_streamaggr_dedup_flush_duration_seconds - a histogram, which shows the duration of deduplication flushes - vm_streamaggr_flush_timeouts_total - counter for timed out stream aggregation flushes, which took longer than the configured interval - vm_streamaggr_dedup_flush_timeouts_total - counter for timed out deduplication flushes, which took longer than the configured dedup_interval - 
Actualize docs/stream-aggregation.md The memory usage reduction increases CPU usage during stream aggregation by up to 30%. This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5850 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5898 2024-03-02 01:42:26 +01:00			`package promutils`

			`import (`
			`"fmt"`
			`"sync"`
			`"testing"`

			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"`
			`)`

			`func TestLabelsCompressorSerial(t *testing.T) {`
			`var lc LabelsCompressor`

			`f := func(labels []prompbmarshal.Label) {`
			`t.Helper()`

			`sExpected := labelsToString(labels)`

			`data := lc.Compress(nil, labels)`
			`labelsResult := lc.Decompress(nil, data)`

			`sResult := labelsToString(labelsResult)`
			`if sExpected != sResult {`
			`t.Fatalf("unexpected result; got %s; want %s", sResult, sExpected)`
			`}`

			`if len(labels) > 0 {`
			`if n := lc.SizeBytes(); n == 0 {`
			`t.Fatalf("Unexpected zero SizeBytes()")`
			`}`
			`if n := lc.ItemsCount(); n == 0 {`
			`t.Fatalf("Unexpected zero ItemsCount()")`
			`}`
			`}`
			`}`

			`// empty labels`
			`f(nil)`
			`f([]prompbmarshal.Label{})`

			`// non-empty labels`
			`f([]prompbmarshal.Label{`
			`{`
			`Name: "instance",`
			`Value: "12345.4342.342.3",`
			`},`
			`{`
			`Name: "job",`
			`Value: "kube-pod-12323",`
			`},`
			`})`
			`f([]prompbmarshal.Label{`
			`{`
			`Name: "instance",`
			`Value: "12345.4342.342.3",`
			`},`
			`{`
			`Name: "job",`
			`Value: "kube-pod-12323",`
			`},`
			`{`
			`Name: "pod",`
			`Value: "foo-bar-baz",`
			`},`
			`})`
			`}`

			`func TestLabelsCompressorConcurrent(t *testing.T) {`
			`const concurrency = 5`
			`var lc LabelsCompressor`
stream aggregation: fix possible duplicated aggregation results (#7118) When ingesting samples with the same labels (duplicated samples or samples with the same labels after `by` or `without` options), they could register different entries for the same labelset in LabelsCompressor. For example, both index 99 and 100 can be assigned to label `foo=1` in two concurrent pushes. Then, due to differing label indexes in encoded keys, the samples will appear as distinct in aggrState, resulting in duplicated results after decompressing the label indexes. https://github.com/VictoriaMetrics/VictoriaMetrics/blob/fbde238cdcdf4e2c892d85a3e9e2be6e54e69cef/lib/streamaggr/streamaggr.go#L933 In this pull request, since we need to store `idxToLabel` first to ensure the idx can be searched after `lc.labelToIdxStore`, `lc.idxToLabel` could still contain a duplicated entry such as [100]="foo=1". But given the low likelihood of this issue and the size of idxToLabel, it should be fine. 2024-09-30 14:24:59 +02:00			`var expectCompressedKeys sync.Map`
lib/streamaggr: huge pile of changes - Reduce memory usage by up to 5x when de-duplicating samples across big number of time series. - Reduce memory usage by up to 5x when aggregating across big number of output time series. - Add lib/promutils.LabelsCompressor, which is going to be used by other VictoriaMetrics components for reducing memory usage for marshaled []prompbmarshal.Label. - Add `dedup_interval` option at aggregation config, which allows setting individual deduplication intervals per each aggregation. - Add `keep_metric_names` option at aggregation config, which allows keeping the original metric names in the output samples. - Add `unique_samples` output, which counts the number of unique sample values. - Add `increase_prometheus` and `total_prometheus` outputs, which ignore the first sample per each newly encountered time series. - Use 64-bit hashes instead of marshaled labels as map keys when calculating `count_series` output. This makes obsolete https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5579 - Expose various metrics, which may help debugging stream aggregation: - vm_streamaggr_dedup_state_size_bytes - the size of data structures responsible for deduplication - vm_streamaggr_dedup_state_items_count - the number of items in the deduplication data structures - vm_streamaggr_labels_compressor_size_bytes - the size of labels compressor data structures - vm_streamaggr_labels_compressor_items_count - the number of entries in the labels compressor - vm_streamaggr_flush_duration_seconds - a histogram, which shows the duration of stream aggregation flushes - vm_streamaggr_dedup_flush_duration_seconds - a histogram, which shows the duration of deduplication flushes - vm_streamaggr_flush_timeouts_total - counter for timed out stream aggregation flushes, which took longer than the configured interval - vm_streamaggr_dedup_flush_timeouts_total - counter for timed out deduplication flushes, which took longer than the configured dedup_interval - 
Actualize docs/stream-aggregation.md The memory usage reduction increases CPU usage during stream aggregation by up to 30%. This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5850 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5898 2024-03-02 01:42:26 +01:00
			`var wg sync.WaitGroup`
			`for i := 0; i < concurrency; i++ {`
			`wg.Add(1)`
			`go func() {`
			`defer wg.Done()`
			`series := newTestSeries(100, 20)`
stream aggregation: fix possible duplicated aggregation results (#7118) When ingesting samples with the same labels (duplicated samples or samples with the same labels after `by` or `without` options), they could register different entries for the same labelset in LabelsCompressor. For example, both index 99 and 100 can be assigned to label `foo=1` in two concurrent pushes. Then, due to differing label indexes in encoded keys, the samples will appear as distinct in aggrState, resulting in duplicated results after decompressing the label indexes. https://github.com/VictoriaMetrics/VictoriaMetrics/blob/fbde238cdcdf4e2c892d85a3e9e2be6e54e69cef/lib/streamaggr/streamaggr.go#L933 In this pull request, since we need to store `idxToLabel` first to ensure the idx can be searched after `lc.labelToIdxStore`, `lc.idxToLabel` could still contain a duplicated entry such as [100]="foo=1". But given the low likelihood of this issue and the size of idxToLabel, it should be fine. 2024-09-30 14:24:59 +02:00			`for n, labels := range series {`
lib/streamaggr: huge pile of changes - Reduce memory usage by up to 5x when de-duplicating samples across big number of time series. - Reduce memory usage by up to 5x when aggregating across big number of output time series. - Add lib/promutils.LabelsCompressor, which is going to be used by other VictoriaMetrics components for reducing memory usage for marshaled []prompbmarshal.Label. - Add `dedup_interval` option at aggregation config, which allows setting individual deduplication intervals per each aggregation. - Add `keep_metric_names` option at aggregation config, which allows keeping the original metric names in the output samples. - Add `unique_samples` output, which counts the number of unique sample values. - Add `increase_prometheus` and `total_prometheus` outputs, which ignore the first sample per each newly encountered time series. - Use 64-bit hashes instead of marshaled labels as map keys when calculating `count_series` output. This makes obsolete https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5579 - Expose various metrics, which may help debugging stream aggregation: - vm_streamaggr_dedup_state_size_bytes - the size of data structures responsible for deduplication - vm_streamaggr_dedup_state_items_count - the number of items in the deduplication data structures - vm_streamaggr_labels_compressor_size_bytes - the size of labels compressor data structures - vm_streamaggr_labels_compressor_items_count - the number of entries in the labels compressor - vm_streamaggr_flush_duration_seconds - a histogram, which shows the duration of stream aggregation flushes - vm_streamaggr_dedup_flush_duration_seconds - a histogram, which shows the duration of deduplication flushes - vm_streamaggr_flush_timeouts_total - counter for timed out stream aggregation flushes, which took longer than the configured interval - vm_streamaggr_dedup_flush_timeouts_total - counter for timed out deduplication flushes, which took longer than the configured dedup_interval - 
Actualize docs/stream-aggregation.md The memory usage reduction increases CPU usage during stream aggregation by up to 30%. This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5850 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5898 2024-03-02 01:42:26 +01:00			`sExpected := labelsToString(labels)`
			`data := lc.Compress(nil, labels)`
stream aggregation: fix possible duplicated aggregation results (#7118) When ingesting samples with the same labels (duplicated samples or samples with the same labels after `by` or `without` options), they could register different entries for the same labelset in LabelsCompressor. For example, both index 99 and 100 can be assigned to label `foo=1` in two concurrent pushes. Then, due to differing label indexes in encoded keys, the samples will appear as distinct in aggrState, resulting in duplicated results after decompressing the label indexes. https://github.com/VictoriaMetrics/VictoriaMetrics/blob/fbde238cdcdf4e2c892d85a3e9e2be6e54e69cef/lib/streamaggr/streamaggr.go#L933 In this pull request, since we need to store `idxToLabel` first to ensure the idx can be searched after `lc.labelToIdxStore`, `lc.idxToLabel` could still contain a duplicated entry such as [100]="foo=1". But given the low likelihood of this issue and the size of idxToLabel, it should be fine. 2024-09-30 14:24:59 +02:00			`if expectData, ok := expectCompressedKeys.LoadOrStore(n, data); ok {`
			`if string(data) != string(expectData.([]byte)) {`
			`panic(fmt.Errorf("unexpected compress result at series/%d in iteration %d ", n, i))`
			`}`
			`}`
lib/streamaggr: huge pile of changes - Reduce memory usage by up to 5x when de-duplicating samples across big number of time series. - Reduce memory usage by up to 5x when aggregating across big number of output time series. - Add lib/promutils.LabelsCompressor, which is going to be used by other VictoriaMetrics components for reducing memory usage for marshaled []prompbmarshal.Label. - Add `dedup_interval` option at aggregation config, which allows setting individual deduplication intervals per each aggregation. - Add `keep_metric_names` option at aggregation config, which allows keeping the original metric names in the output samples. - Add `unique_samples` output, which counts the number of unique sample values. - Add `increase_prometheus` and `total_prometheus` outputs, which ignore the first sample per each newly encountered time series. - Use 64-bit hashes instead of marshaled labels as map keys when calculating `count_series` output. This makes obsolete https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5579 - Expose various metrics, which may help debugging stream aggregation: - vm_streamaggr_dedup_state_size_bytes - the size of data structures responsible for deduplication - vm_streamaggr_dedup_state_items_count - the number of items in the deduplication data structures - vm_streamaggr_labels_compressor_size_bytes - the size of labels compressor data structures - vm_streamaggr_labels_compressor_items_count - the number of entries in the labels compressor - vm_streamaggr_flush_duration_seconds - a histogram, which shows the duration of stream aggregation flushes - vm_streamaggr_dedup_flush_duration_seconds - a histogram, which shows the duration of deduplication flushes - vm_streamaggr_flush_timeouts_total - counter for timed out stream aggregation flushes, which took longer than the configured interval - vm_streamaggr_dedup_flush_timeouts_total - counter for timed out deduplication flushes, which took longer than the configured dedup_interval - 
Actualize docs/stream-aggregation.md The memory usage reduction increases CPU usage during stream aggregation by up to 30%. This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5850 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5898 2024-03-02 01:42:26 +01:00			`labelsResult := lc.Decompress(nil, data)`
			`sResult := labelsToString(labelsResult)`
			`if sExpected != sResult {`
			`panic(fmt.Errorf("unexpected result on iteration %d; got %s; want %s", i, sResult, sExpected))`
			`}`
			`}`
			`}()`
			`}`
			`wg.Wait()`

			`if n := lc.SizeBytes(); n == 0 {`
			`t.Fatalf("Unexpected zero SizeBytes()")`
			`}`
			`if n := lc.ItemsCount(); n == 0 {`
			`t.Fatalf("Unexpected zero ItemsCount()")`
			`}`
			`}`

			`func labelsToString(labels []prompbmarshal.Label) string {`
			`l := Labels{`
			`Labels: labels,`
			`}`
			`return l.String()`
			`}`

			`func newTestSeries(seriesCount, labelsPerSeries int) [][]prompbmarshal.Label {`
			`series := make([][]prompbmarshal.Label, seriesCount)`
			`for i := 0; i < seriesCount; i++ {`
			`labels := make([]prompbmarshal.Label, labelsPerSeries)`
			`for j := 0; j < labelsPerSeries; j++ {`
			`labels[j] = prompbmarshal.Label{`
			`Name: fmt.Sprintf("label_%d", j),`
			`Value: fmt.Sprintf("value_%d_%d", i, j),`
			`}`
			`}`
			`series[i] = labels`
			`}`
			`return series`
			`}`