diff --git a/README.md b/README.md
index a06200012..fea5f7976 100644
--- a/README.md
+++ b/README.md
@@ -910,6 +910,9 @@ The most interesting metrics are:
 * `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted int the database per second.
 * `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
 * `sum(vm_data_size_bytes)` - the total size of data on disk.
+* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
+  If this value remains high for extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
 
 ### Troubleshooting
 
@@ -922,8 +925,9 @@ The most interesting metrics are:
 
 * If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
   then it is likely you have too many active time series for the current amount of RAM.
+  See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
   It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
-  ingestion performance.
+  ingestion performance in this case.
   Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this option,
   since too big value for `-memory.allowedPercent` may result in high I/O usage.
 
diff --git a/app/vmstorage/main.go b/app/vmstorage/main.go
index 5b56cb061..3b03b2ba6 100644
--- a/app/vmstorage/main.go
+++ b/app/vmstorage/main.go
@@ -409,6 +409,13 @@ func registerStorageMetrics() {
         return float64(m().AddRowsConcurrencyCurrent)
     })
 
+    metrics.NewGauge(`vm_slow_row_inserts_total`, func() float64 {
+        return float64(m().SlowRowInserts)
+    })
+    metrics.NewGauge(`vm_slow_per_day_index_inserts_total`, func() float64 {
+        return float64(m().SlowPerDayIndexInserts)
+    })
+
     metrics.NewGauge(`vm_rows{type="storage/big"}`, func() float64 {
         return float64(tm().BigRowsCount)
     })
diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md
index a06200012..fea5f7976 100644
--- a/docs/Single-server-VictoriaMetrics.md
+++ b/docs/Single-server-VictoriaMetrics.md
@@ -910,6 +910,9 @@ The most interesting metrics are:
 * `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted int the database per second.
 * `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
 * `sum(vm_data_size_bytes)` - the total size of data on disk.
+* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
+  If this value remains high for extended periods of time, then it is likely more RAM is needed for optimal handling
+  of the current number of active time series.
 
 ### Troubleshooting
 
@@ -922,8 +925,9 @@ The most interesting metrics are:
 
 * If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
   then it is likely you have too many active time series for the current amount of RAM.
+  See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
   It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
-  ingestion performance.
+  ingestion performance in this case.
   Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this option,
   since too big value for `-memory.allowedPercent` may result in high I/O usage.
 
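Note (not part of the patch): the README hunks above recommend watching `increase(vm_slow_row_inserts_total[5m])`. As an illustrative sketch only, the snippet below polls that query through the Prometheus-compatible `/api/v1/query` endpoint of a single-node VictoriaMetrics instance; `localhost:8428` is the default listen address and is an assumption here, not something introduced by this change.

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
)

func main() {
	// The query recommended in the docs above; localhost:8428 is assumed to be
	// a running single-node VictoriaMetrics instance on its default port.
	params := url.Values{"query": {`increase(vm_slow_row_inserts_total[5m])`}}
	resp, err := http.Get("http://localhost:8428/api/v1/query?" + params.Encode())
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Decode just enough of the Prometheus-compatible response to print the value.
	var r struct {
		Data struct {
			Result []struct {
				Value [2]interface{} `json:"value"`
			} `json:"result"`
		} `json:"data"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&r); err != nil {
		panic(err)
	}
	for _, res := range r.Data.Result {
		fmt.Println("slow row inserts over the last 5 minutes:", res.Value[1])
	}
}
```

A persistently non-zero result from this query is the signal the troubleshooting section above ties to insufficient RAM for the current number of active time series.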
diff --git a/lib/storage/storage.go b/lib/storage/storage.go
index 1cbb5e99c..0c4d68992 100644
--- a/lib/storage/storage.go
+++ b/lib/storage/storage.go
@@ -39,6 +39,9 @@ type Storage struct {
     addRowsConcurrencyLimitTimeout uint64
     addRowsConcurrencyDroppedRows  uint64
 
+    slowRowInserts         uint64
+    slowPerDayIndexInserts uint64
+
     path            string
     cachePath       string
     retentionMonths int
@@ -323,6 +326,9 @@ type Metrics struct {
     AddRowsConcurrencyCapacity uint64
     AddRowsConcurrencyCurrent  uint64
 
+    SlowRowInserts         uint64
+    SlowPerDayIndexInserts uint64
+
     TSIDCacheSize         uint64
     TSIDCacheSizeBytes    uint64
     TSIDCacheRequests     uint64
@@ -377,6 +383,9 @@ func (s *Storage) UpdateMetrics(m *Metrics) {
     m.AddRowsConcurrencyCapacity = uint64(cap(addRowsConcurrencyCh))
     m.AddRowsConcurrencyCurrent = uint64(len(addRowsConcurrencyCh))
 
+    m.SlowRowInserts += atomic.LoadUint64(&s.slowRowInserts)
+    m.SlowPerDayIndexInserts += atomic.LoadUint64(&s.slowPerDayIndexInserts)
+
     var cs fastcache.Stats
     s.tsidCache.UpdateStats(&cs)
     m.TSIDCacheSize += cs.EntriesCount
@@ -1095,6 +1104,7 @@ func (s *Storage) add(rows []rawRow, mrs []MetricRow, precisionBits uint8) ([]ra
         }
     }
     if pmrs != nil {
+        atomic.AddUint64(&s.slowRowInserts, uint64(len(pmrs.pmrs)))
         // Sort pendingMetricRows by canonical metric name in order to speed up search via `is` in the loop below.
         pendingMetricRows := pmrs.pmrs
         sort.Slice(pendingMetricRows, func(i, j int) bool {
@@ -1294,6 +1304,7 @@ func (s *Storage) updatePerDateData(rows []rawRow) error {
 
     // Slow path - add new (date, metricID) entries to indexDB.
+    atomic.AddUint64(&s.slowPerDayIndexInserts, uint64(len(pendingDateMetricIDs)))
     // Sort pendingDateMetricIDs by (date, metricID) in order to speed up `is` search in the loop below.
     sort.Slice(pendingDateMetricIDs, func(i, j int) bool {
         a := pendingDateMetricIDs[i]
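Note (not part of the patch): the storage.go and vmstorage hunks follow one simple pattern - an atomically incremented `uint64` counter on the slow path, surfaced through `metrics.NewGauge` with a callback so the exported value is read lazily at scrape time. Below is a minimal self-contained sketch of that pattern using the same `github.com/VictoriaMetrics/metrics` package; the metric name and counter variable are hypothetical and only illustrate the idea.

```go
package main

import (
	"os"
	"sync/atomic"

	"github.com/VictoriaMetrics/metrics"
)

// slowInserts mirrors the role of Storage.slowRowInserts above: a plain uint64
// bumped atomically on the slow path and only read when metrics are exposed.
var slowInserts uint64

func main() {
	// Register a gauge whose callback reads the counter lazily on every exposition,
	// the same pattern registerStorageMetrics uses for vm_slow_row_inserts_total.
	metrics.NewGauge(`example_slow_inserts_total`, func() float64 {
		return float64(atomic.LoadUint64(&slowInserts))
	})

	// Simulate a few slow-path inserts.
	for i := 0; i < 3; i++ {
		atomic.AddUint64(&slowInserts, 1)
	}

	// Dump all registered metrics in Prometheus text exposition format.
	metrics.WritePrometheus(os.Stdout, false)
}
```

Keeping the counter as a bare field read through a closure avoids any locking on the hot ingestion path; the cost is a single atomic add when the slow path is taken.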