app/vmstorage: add vm_slow_row_inserts_total and vm_slow_per_day_index_inserts_total metrics for determining whether VictoriaMetrics requires more RAM for the current number of active time series

2024-11-23 12:31:07 +01:00 · 2020-05-15 13:44:23 +03:00 · 2020-05-15 13:44:23 +03:00 · 82ccdfaa91
commit 82ccdfaa91
parent ab8f5545bc
4 changed files with 28 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -910,6 +910,9 @@ The most interesting metrics are:
 * `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted int the database per second.
 * `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
 * `sum(vm_data_size_bytes)` - the total size of data on disk.
+* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
+  If this value remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.


 ### Troubleshooting
@ -922,8 +925,9 @@ The most interesting metrics are:

 * If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
  then it is likely you have too many active time series for the current amount of RAM.
+  See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
  It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
-  ingestion performance.
+  ingestion performance in this case.
  Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
  option, since too big value for `-memory.allowedPercent` may result in high I/O usage.

--- a/app/vmstorage/main.go
+++ b/app/vmstorage/main.go
@ -409,6 +409,13 @@ func registerStorageMetrics() {
 		return float64(m().AddRowsConcurrencyCurrent)
 	})

+	metrics.NewGauge(`vm_slow_row_inserts_total`, func() float64 {
+		return float64(m().SlowRowInserts)
+	})
+	metrics.NewGauge(`vm_slow_per_day_index_inserts_total`, func() float64 {
+		return float64(m().SlowPerDayIndexInserts)
+	})
+
 	metrics.NewGauge(`vm_rows{type="storage/big"}`, func() float64 {
 		return float64(tm().BigRowsCount)
 	})
--- a/docs/Single-server-VictoriaMetrics.md
+++ b/docs/Single-server-VictoriaMetrics.md
@ -910,6 +910,9 @@ The most interesting metrics are:
 * `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted int the database per second.
 * `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
 * `sum(vm_data_size_bytes)` - the total size of data on disk.
+* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
+  If this value remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.


 ### Troubleshooting
@ -922,8 +925,9 @@ The most interesting metrics are:

 * If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
  then it is likely you have too many active time series for the current amount of RAM.
+  See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
  It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
-  ingestion performance.
+  ingestion performance in this case.
  Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
  option, since too big value for `-memory.allowedPercent` may result in high I/O usage.

--- a/lib/storage/storage.go
+++ b/lib/storage/storage.go
@ -39,6 +39,9 @@ type Storage struct {
 	addRowsConcurrencyLimitTimeout uint64
 	addRowsConcurrencyDroppedRows  uint64

+	slowRowInserts         uint64
+	slowPerDayIndexInserts uint64
+
 	path            string
 	cachePath       string
 	retentionMonths int
@ -323,6 +326,9 @@ type Metrics struct {
 	AddRowsConcurrencyCapacity     uint64
 	AddRowsConcurrencyCurrent      uint64

+	SlowRowInserts         uint64
+	SlowPerDayIndexInserts uint64
+
 	TSIDCacheSize       uint64
 	TSIDCacheSizeBytes  uint64
 	TSIDCacheRequests   uint64
@ -377,6 +383,9 @@ func (s *Storage) UpdateMetrics(m *Metrics) {
 	m.AddRowsConcurrencyCapacity = uint64(cap(addRowsConcurrencyCh))
 	m.AddRowsConcurrencyCurrent = uint64(len(addRowsConcurrencyCh))

+	m.SlowRowInserts += atomic.LoadUint64(&s.slowRowInserts)
+	m.SlowPerDayIndexInserts += atomic.LoadUint64(&s.slowPerDayIndexInserts)
+
 	var cs fastcache.Stats
 	s.tsidCache.UpdateStats(&cs)
 	m.TSIDCacheSize += cs.EntriesCount
@ -1095,6 +1104,7 @@ func (s *Storage) add(rows []rawRow, mrs []MetricRow, precisionBits uint8) ([]ra
 		}
 	}
 	if pmrs != nil {
+		atomic.AddUint64(&s.slowRowInserts, uint64(len(pmrs.pmrs)))
 		// Sort pendingMetricRows by canonical metric name in order to speed up search via `is` in the loop below.
 		pendingMetricRows := pmrs.pmrs
 		sort.Slice(pendingMetricRows, func(i, j int) bool {
@ -1294,6 +1304,7 @@ func (s *Storage) updatePerDateData(rows []rawRow) error {

 	// Slow path - add new (date, metricID) entries to indexDB.

+	atomic.AddUint64(&s.slowPerDayIndexInserts, uint64(len(pendingDateMetricIDs)))
 	// Sort pendingDateMetricIDs by (date, metricID) in order to speed up `is` search in the loop below.
 	sort.Slice(pendingDateMetricIDs, func(i, j int) bool {
 		a := pendingDateMetricIDs[i]