From 82ffbcb9a664c2e237981aeaee2ef73f85be8f3f Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin
Date: Fri, 15 May 2020 14:11:39 +0300
Subject: [PATCH] app/vmstorage: add `vm_slow_metric_name_loads_total` metric,
 which can be used as an indicator that more RAM is needed for improving
 query performance

---
 README.md                             | 11 +++++++----
 app/vmstorage/main.go                 |  3 +++
 docs/Single-server-VictoriaMetrics.md | 11 +++++++----
 lib/storage/storage.go                |  4 ++++
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index fea5f7976..5b79c1291 100644
--- a/README.md
+++ b/README.md
@@ -911,8 +911,11 @@ The most interesting metrics are:
 * `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
 * `sum(vm_data_size_bytes)` - the total size of data on disk.
 * `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
-  If this value remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
-  for the current number of active time series.
+  If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
+* `increase(vm_slow_metric_name_loads_total[5m])` - the number of slow loads of metric names during the last 5 minutes.
+  If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
 
 
 ### Troubleshooting
@@ -925,9 +928,9 @@ The most interesting metrics are:
 
 * If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
   then it is likely you have too many active time series for the current amount of RAM.
-  See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
+  VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
   It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
-  ingestion performance in this case.
+  ingestion and query performance in this case.
   Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this option, since too big value for `-memory.allowedPercent` may result in high I/O usage.
 
 
diff --git a/app/vmstorage/main.go b/app/vmstorage/main.go
index 3b03b2ba6..8f37b72f0 100644
--- a/app/vmstorage/main.go
+++ b/app/vmstorage/main.go
@@ -415,6 +415,9 @@ func registerStorageMetrics() {
 	metrics.NewGauge(`vm_slow_per_day_index_inserts_total`, func() float64 {
 		return float64(m().SlowPerDayIndexInserts)
 	})
+	metrics.NewGauge(`vm_slow_metric_name_loads_total`, func() float64 {
+		return float64(m().SlowMetricNameLoads)
+	})
 
 	metrics.NewGauge(`vm_rows{type="storage/big"}`, func() float64 {
 		return float64(tm().BigRowsCount)
diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md
index fea5f7976..5b79c1291 100644
--- a/docs/Single-server-VictoriaMetrics.md
+++ b/docs/Single-server-VictoriaMetrics.md
@@ -911,8 +911,11 @@ The most interesting metrics are:
 * `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
 * `sum(vm_data_size_bytes)` - the total size of data on disk.
 * `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
-  If this value remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
-  for the current number of active time series.
+  If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
+* `increase(vm_slow_metric_name_loads_total[5m])` - the number of slow loads of metric names during the last 5 minutes.
+  If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
 
 
 ### Troubleshooting
@@ -925,9 +928,9 @@ The most interesting metrics are:
 
 * If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
   then it is likely you have too many active time series for the current amount of RAM.
-  See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
+  VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
   It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
-  ingestion performance in this case.
+  ingestion and query performance in this case.
   Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this option, since too big value for `-memory.allowedPercent` may result in high I/O usage.
 
 
diff --git a/lib/storage/storage.go b/lib/storage/storage.go
index 0c4d68992..9c7186993 100644
--- a/lib/storage/storage.go
+++ b/lib/storage/storage.go
@@ -41,6 +41,7 @@ type Storage struct {
 
 	slowRowInserts         uint64
 	slowPerDayIndexInserts uint64
+	slowMetricNameLoads    uint64
 
 	path      string
 	cachePath string
@@ -328,6 +329,7 @@ type Metrics struct {
 
 	SlowRowInserts         uint64
 	SlowPerDayIndexInserts uint64
+	SlowMetricNameLoads    uint64
 
 	TSIDCacheSize      uint64
 	TSIDCacheSizeBytes uint64
@@ -385,6 +387,7 @@ func (s *Storage) UpdateMetrics(m *Metrics) {
 
 	m.SlowRowInserts += atomic.LoadUint64(&s.slowRowInserts)
 	m.SlowPerDayIndexInserts += atomic.LoadUint64(&s.slowPerDayIndexInserts)
+	m.SlowMetricNameLoads += atomic.LoadUint64(&s.slowMetricNameLoads)
 
 	var cs fastcache.Stats
 	s.tsidCache.UpdateStats(&cs)
@@ -814,6 +817,7 @@ func (s *Storage) prefetchMetricNames(tsids []TSID) error {
 		}
 		metricIDs = append(metricIDs, metricID)
 	}
+	atomic.AddUint64(&s.slowMetricNameLoads, uint64(len(metricIDs)))
 	if len(metricIDs) < 500 {
 		// It is cheaper to skip pre-fetching and obtain metricNames inline.
 		return nil
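For readers skimming the diff, here is the pattern in isolation: a hot path bumps a plain `uint64` field with `atomic.AddUint64`, and the value is exported lazily through a callback-backed gauge from the `github.com/VictoriaMetrics/metrics` package that vmstorage already uses. The sketch below is illustrative only; the `demoStorage` type, the direct atomic read in the gauge callback (the real patch routes the value through `Storage.UpdateMetrics` and the `Metrics` struct), the simulated 700-ID prefetch, and the listen address are assumptions, not part of the patch.

```go
package main

import (
	"log"
	"net/http"
	"sync/atomic"

	"github.com/VictoriaMetrics/metrics"
)

// demoStorage stands in for lib/storage.Storage: a plain uint64 counter that
// hot paths increment with atomic.AddUint64 and readers load with atomic.LoadUint64.
type demoStorage struct {
	slowMetricNameLoads uint64
}

// prefetchMetricNames mimics the instrumented slow path: every metric name that
// would have to be pre-fetched from the index is counted before the actual work.
func (s *demoStorage) prefetchMetricNames(metricIDs []uint64) {
	atomic.AddUint64(&s.slowMetricNameLoads, uint64(len(metricIDs)))
	// ... pre-fetching of metric names would go here ...
}

func main() {
	s := &demoStorage{}

	// NewGauge registers a callback-backed metric; the callback runs on each
	// scrape, so the counter is read only when /metrics is requested. Despite
	// the *_total suffix it is exported via NewGauge, matching the style used
	// in app/vmstorage/main.go.
	metrics.NewGauge(`vm_slow_metric_name_loads_total`, func() float64 {
		return float64(atomic.LoadUint64(&s.slowMetricNameLoads))
	})

	// Simulate a query that needs 700 metric names loaded via the slow path.
	s.prefetchMetricNames(make([]uint64, 700))

	http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
		metrics.WritePrometheus(w, false)
	})
	log.Fatal(http.ListenAndServe(":8428", nil))
}
```

Once the metric is scraped, the documented expression `increase(vm_slow_metric_name_loads_total[5m])` shows how many metric names went through the slow load path over the last five minutes; a persistently high number is the signal, per the README change above, that more RAM is likely needed for the current number of active time series.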