app/vmstorage: add vm_slow_row_inserts_total and vm_slow_per_day_index_inserts_total metrics for determining whether VictoriaMetrics requires more RAM for the current number of active time series

Aliaksandr Valialkin 2020-05-15 13:44:23 +03:00
parent ab8f5545bc
commit 82ccdfaa91
4 changed files with 28 additions and 2 deletions

View File

@@ -910,6 +910,9 @@ The most interesting metrics are:
* `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted into the database per second.
* `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
* `sum(vm_data_size_bytes)` - the total size of data on disk.
* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
If this value remains high during extended periods of time, then it is likely more RAM is needed
for optimal handling of the current number of active time series.
### Troubleshooting
@@ -922,8 +925,9 @@ The most interesting metrics are:
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
then it is likely you have too many active time series for the current amount of RAM.
See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
ingestion performance in this case.
Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
option, since too big a value for `-memory.allowedPercent` may result in high I/O usage.
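
As a quick illustration of how the new counter can be consumed, here is a hypothetical helper (not part of this commit) that reads `vm_slow_row_inserts_total` from the `/metrics` page of a single-node VictoriaMetrics instance; the `http://localhost:8428` address assumes the default `-httpListenAddr`. In practice the PromQL expressions listed above, e.g. `increase(vm_slow_row_inserts_total[5m])`, are the easier way to watch it.

```go
// slowinserts.go - a hypothetical helper, not part of this commit.
// It scrapes the /metrics page of a single-node VictoriaMetrics instance
// and prints the current value of vm_slow_row_inserts_total so it can be
// compared between runs.
package main

import (
	"bufio"
	"fmt"
	"log"
	"net/http"
	"strconv"
	"strings"
)

// scrapeCounter returns the value of an unlabeled metric from a
// Prometheus text-format exposition page.
func scrapeCounter(url, name string) (float64, error) {
	resp, err := http.Get(url)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()
	sc := bufio.NewScanner(resp.Body)
	for sc.Scan() {
		line := sc.Text()
		if strings.HasPrefix(line, name+" ") {
			return strconv.ParseFloat(strings.TrimSpace(strings.TrimPrefix(line, name)), 64)
		}
	}
	return 0, sc.Err()
}

func main() {
	const metricsURL = "http://localhost:8428/metrics" // assumed default address
	v, err := scrapeCounter(metricsURL, "vm_slow_row_inserts_total")
	if err != nil {
		log.Fatalf("cannot scrape %s: %s", metricsURL, err)
	}
	fmt.Printf("vm_slow_row_inserts_total %g\n", v)
}
```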

View File

@@ -409,6 +409,13 @@ func registerStorageMetrics() {
		return float64(m().AddRowsConcurrencyCurrent)
	})
	metrics.NewGauge(`vm_slow_row_inserts_total`, func() float64 {
		return float64(m().SlowRowInserts)
	})
	metrics.NewGauge(`vm_slow_per_day_index_inserts_total`, func() float64 {
		return float64(m().SlowPerDayIndexInserts)
	})
	metrics.NewGauge(`vm_rows{type="storage/big"}`, func() float64 {
		return float64(tm().BigRowsCount)
	})
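
For context, the `metrics` package used here (presumably github.com/VictoriaMetrics/metrics) evaluates the gauge callback on every scrape, which is why `NewGauge` can export a monotonically growing counter. Below is a minimal, self-contained sketch of that pattern; the `storageMetrics` struct and the `snapshot` closure are simplified stand-ins for `storage.Metrics` and the `m()` helper in `registerStorageMetrics`, not code from this commit.

```go
// A simplified stand-in for the callback-gauge pattern above, not code
// from this commit.
package main

import (
	"net/http"

	"github.com/VictoriaMetrics/metrics"
)

// storageMetrics mimics the relevant fields of storage.Metrics.
type storageMetrics struct {
	SlowRowInserts         uint64
	SlowPerDayIndexInserts uint64
}

func main() {
	// snapshot plays the role of the m() helper: in vmstorage it would call
	// Storage.UpdateMetrics to fill a fresh Metrics value.
	snapshot := func() *storageMetrics {
		return &storageMetrics{SlowRowInserts: 42, SlowPerDayIndexInserts: 7}
	}
	// The callbacks run at scrape time, so the exported values always reflect
	// the latest snapshot even though the metrics are cumulative counters.
	metrics.NewGauge(`vm_slow_row_inserts_total`, func() float64 {
		return float64(snapshot().SlowRowInserts)
	})
	metrics.NewGauge(`vm_slow_per_day_index_inserts_total`, func() float64 {
		return float64(snapshot().SlowPerDayIndexInserts)
	})
	http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
		metrics.WritePrometheus(w, false)
	})
	_ = http.ListenAndServe(":8428", nil)
}
```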

View File

@@ -910,6 +910,9 @@ The most interesting metrics are:
* `sum(rate(vm_rows_inserted_total[5m]))` - ingestion rate, i.e. how many samples are inserted into the database per second.
* `vm_free_disk_space_bytes` - free space left at `-storageDataPath`.
* `sum(vm_data_size_bytes)` - the total size of data on disk.
* `increase(vm_slow_row_inserts_total[5m])` - the number of slow inserts during the last 5 minutes.
If this value remains high during extended periods of time, then it is likely more RAM is needed
for optimal handling of the current number of active time series.
### Troubleshooting
@@ -922,8 +925,9 @@ The most interesting metrics are:
* If VictoriaMetrics works slowly and eats more than a CPU core per 100K ingested data points per second,
then it is likely you have too many active time series for the current amount of RAM.
See `vm_slow_row_inserts_total` and `vm_slow_per_day_index_inserts_total` [metrics](#monitoring).
It is recommended increasing the amount of RAM on the node with VictoriaMetrics in order to improve
ingestion performance in this case.
Another option is to increase `-memory.allowedPercent` command-line flag value. Be careful with this
option, since too big a value for `-memory.allowedPercent` may result in high I/O usage.

View File

@@ -39,6 +39,9 @@ type Storage struct {
	addRowsConcurrencyLimitTimeout uint64
	addRowsConcurrencyDroppedRows uint64
	slowRowInserts uint64
	slowPerDayIndexInserts uint64
	path string
	cachePath string
	retentionMonths int
@@ -323,6 +326,9 @@ type Metrics struct {
	AddRowsConcurrencyCapacity uint64
	AddRowsConcurrencyCurrent uint64
	SlowRowInserts uint64
	SlowPerDayIndexInserts uint64
	TSIDCacheSize uint64
	TSIDCacheSizeBytes uint64
	TSIDCacheRequests uint64
@@ -377,6 +383,9 @@ func (s *Storage) UpdateMetrics(m *Metrics) {
	m.AddRowsConcurrencyCapacity = uint64(cap(addRowsConcurrencyCh))
	m.AddRowsConcurrencyCurrent = uint64(len(addRowsConcurrencyCh))
	m.SlowRowInserts += atomic.LoadUint64(&s.slowRowInserts)
	m.SlowPerDayIndexInserts += atomic.LoadUint64(&s.slowPerDayIndexInserts)
	var cs fastcache.Stats
	s.tsidCache.UpdateStats(&cs)
	m.TSIDCacheSize += cs.EntriesCount
@@ -1095,6 +1104,7 @@ func (s *Storage) add(rows []rawRow, mrs []MetricRow, precisionBits uint8) ([]ra
		}
	}
	if pmrs != nil {
		atomic.AddUint64(&s.slowRowInserts, uint64(len(pmrs.pmrs)))
		// Sort pendingMetricRows by canonical metric name in order to speed up search via `is` in the loop below.
		pendingMetricRows := pmrs.pmrs
		sort.Slice(pendingMetricRows, func(i, j int) bool {
@@ -1294,6 +1304,7 @@ func (s *Storage) updatePerDateData(rows []rawRow) error {
	// Slow path - add new (date, metricID) entries to indexDB.
	atomic.AddUint64(&s.slowPerDayIndexInserts, uint64(len(pendingDateMetricIDs)))
	// Sort pendingDateMetricIDs by (date, metricID) in order to speed up `is` search in the loop below.
	sort.Slice(pendingDateMetricIDs, func(i, j int) bool {
		a := pendingDateMetricIDs[i]
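
To summarize the accounting pattern added in this file: the fast path stays untouched, rows that fall through to the slow path are counted with `atomic.AddUint64`, and `UpdateMetrics` copies the counters into the snapshot read by the gauges registered in `registerStorageMetrics`. Below is a minimal self-contained sketch of the same pattern, using hypothetical stand-ins for `Storage`, `Metrics` and the fast-path cache rather than code from this commit.

```go
// A minimal sketch of the slow-path counting pattern; the types here are
// hypothetical stand-ins, not code from this commit.
package main

import (
	"fmt"
	"sync/atomic"
)

type Storage struct {
	slowRowInserts uint64
}

type Metrics struct {
	SlowRowInserts uint64
}

// addRows mimics Storage.add: rows that miss the fast-path cache fall through
// to the slow path and are counted before being processed there.
func (s *Storage) addRows(rows []string, inCache func(string) bool) {
	var pending []string
	for _, r := range rows {
		if inCache(r) {
			continue // fast path: series already known
		}
		pending = append(pending, r)
	}
	if len(pending) > 0 {
		atomic.AddUint64(&s.slowRowInserts, uint64(len(pending)))
		// slow path: create the missing index entries for pending rows ...
	}
}

// UpdateMetrics mirrors the real method: it accumulates the counter into the
// caller-provided snapshot with an atomic load.
func (s *Storage) UpdateMetrics(m *Metrics) {
	m.SlowRowInserts += atomic.LoadUint64(&s.slowRowInserts)
}

func main() {
	var s Storage
	known := map[string]bool{"cpu_usage": true}
	s.addRows([]string{"cpu_usage", "new_series_1", "new_series_2"}, func(name string) bool { return known[name] })

	var m Metrics
	s.UpdateMetrics(&m)
	fmt.Println("slow row inserts:", m.SlowRowInserts) // prints 2
}
```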