diff --git a/README.md b/README.md
index fea5f7976..5b79c1291 100644
--- a/README.md
+++ b/README.md
@@ -911,8 +911,11 @@ The most interesting metrics are:
 * `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
 * `sum(vm_data_size_bytes)` - the total size of data on disk.
 * `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
-  If this value remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
-  for the current number of active time series.
+  If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
+* `increase(vm_slow_metric_name_loads_total[5m])` - the number of slow loads of metric names during the last 5 minutes.
+  If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
 
 ### Troubleshooting
 
@@ -925,9 +928,9 @@ The most interesting metrics are:
 
 * If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
   then it is likely you have too many active time series for the current amount of RAM.
-  See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
+  VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
   It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
-  ingestion performance in this case.
+  ingestion and query performance in this case.
   Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this option, since too big value
   for `-memory.allowedPercent` may result in high I/O usage.
 
diff --git a/app/vmstorage/main.go b/app/vmstorage/main.go
index 3b03b2ba6..8f37b72f0 100644
--- a/app/vmstorage/main.go
+++ b/app/vmstorage/main.go
@@ -415,6 +415,9 @@ func registerStorageMetrics() {
 	metrics.NewGauge(`vm_slow_per_day_index_inserts_total`, func() float64 {
 		return float64(m().SlowPerDayIndexInserts)
 	})
+	metrics.NewGauge(`vm_slow_metric_name_loads_total`, func() float64 {
+		return float64(m().SlowMetricNameLoads)
+	})
 
 	metrics.NewGauge(`vm_rows{type="storage/big"}`, func() float64 {
 		return float64(tm().BigRowsCount)
diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md
index fea5f7976..5b79c1291 100644
--- a/docs/Single-server-VictoriaMetrics.md
+++ b/docs/Single-server-VictoriaMetrics.md
@@ -911,8 +911,11 @@ The most interesting metrics are:
 * `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
 * `sum(vm_data_size_bytes)` - the total size of data on disk.
 * `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
-  If this value remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
-  for the current number of active time series.
+  If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
+* `increase(vm_slow_metric_name_loads_total[5m])` - the number of slow loads of metric names during the last 5 minutes.
+  If this number remains high during extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
 
 ### Troubleshooting
 
@@ -925,9 +928,9 @@ The most interesting metrics are:
 
 * If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
   then it is likely you have too many active time series for the current amount of RAM.
-  See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
+  VictoriaMetrics [exposes](#monitoring) `vm_slow_*` metrics, which could be used as an indicator of low amounts of RAM.
   It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
-  ingestion performance in this case.
+  ingestion and query performance in this case.
   Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this option, since too big value
   for `-memory.allowedPercent` may result in high I/O usage.
 
diff --git a/lib/storage/storage.go b/lib/storage/storage.go
index 0c4d68992..9c7186993 100644
--- a/lib/storage/storage.go
+++ b/lib/storage/storage.go
@@ -41,6 +41,7 @@ type Storage struct {
 
 	slowRowInserts         uint64
 	slowPerDayIndexInserts uint64
+	slowMetricNameLoads    uint64
 
 	path      string
 	cachePath string
@@ -328,6 +329,7 @@ type Metrics struct {
 
 	SlowRowInserts         uint64
 	SlowPerDayIndexInserts uint64
+	SlowMetricNameLoads    uint64
 
 	TSIDCacheSize      uint64
 	TSIDCacheSizeBytes uint64
@@ -385,6 +387,7 @@ func (s *Storage) UpdateMetrics(m *Metrics) {
 
 	m.SlowRowInserts += atomic.LoadUint64(&s.slowRowInserts)
 	m.SlowPerDayIndexInserts += atomic.LoadUint64(&s.slowPerDayIndexInserts)
+	m.SlowMetricNameLoads += atomic.LoadUint64(&s.slowMetricNameLoads)
 
 	var cs fastcache.Stats
 	s.tsidCache.UpdateStats(&cs)
@@ -814,6 +817,7 @@ func (s *Storage) prefetchMetricNames(tsids []TSID) error {
 		}
 		metricIDs = append(metricIDs, metricID)
 	}
+	atomic.AddUint64(&s.slowMetricNameLoads, uint64(len(metricIDs)))
 	if len(metricIDs) < 500 {
 		// It is cheaper to skip pre-fetching and obtain metricNames inline.
 		return nil
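
For readers unfamiliar with the github.com/VictoriaMetrics/metrics package used above, the following standalone sketch shows the same counter-plus-gauge pattern this change applies: a uint64 bumped with atomic.AddUint64 on the hot path and re-read by a gauge callback on every scrape. The package-level slowMetricNameLoads variable and the simulated 750-name prefetch below are illustrative stand-ins, not code taken from this patch.

package main

import (
	"os"
	"sync/atomic"

	"github.com/VictoriaMetrics/metrics"
)

// slowMetricNameLoads stands in for the new Storage.slowMetricNameLoads field
// (hypothetical standalone variable for this sketch): the hot path bumps it with
// atomic.AddUint64 and the gauge callback reads it lock-free on every scrape.
var slowMetricNameLoads uint64

func main() {
	// Same registration pattern as registerStorageMetrics in app/vmstorage/main.go:
	// the callback re-evaluates the atomic counter each time the metrics are rendered.
	metrics.NewGauge(`vm_slow_metric_name_loads_total`, func() float64 {
		return float64(atomic.LoadUint64(&slowMetricNameLoads))
	})

	// Simulate a prefetchMetricNames call that had to read 750 metric names
	// from the index instead of serving them from the in-memory cache.
	atomic.AddUint64(&slowMetricNameLoads, 750)

	// Dump all registered metrics in Prometheus text format; VictoriaMetrics
	// serves equivalent output on its /metrics endpoint.
	metrics.WritePrometheus(os.Stdout, false)
}

Running this prints vm_slow_metric_name_loads_total 750 in Prometheus text exposition format, which is the series that the documented expression increase(vm_slow_metric_name_loads_total[5m]) operates on once the metric is scraped from /metrics.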