lib/storage: wait for up to 60 seconds before deciding to delete metricID entries from indexdb if metricID->metricName entry is missing during search

The metricID->metricName entry can remain invisible for search for some time after registering new metricName.
This is expected condition. So wait for up to 60 seconds in the hope that the metricID->metricName
entry will become visible before deleting all the entries from indexdb, which are associated with the given metricID.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5948

See also 20812008a7
This commit is contained in:
Aliaksandr Valialkin 2024-03-18 00:19:17 +02:00
parent 729b263670
commit 76f00cea6b
No known key found for this signature in database
GPG Key ID: 52C003EE2BCDB9EB
3 changed files with 46 additions and 7 deletions

View File

@ -55,6 +55,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
* FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): support client-side TLS configuration for [native protocol](https://docs.victoriametrics.com/vmctl/#migrating-data-from-victoriametrics). See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5748). Thanks to @khushijain21 for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5824).
* FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): support client-side TLS configuration for VictoriaMetrics destination specified via `--vm-*` cmd-line flags used in [InfluxDB](https://docs.victoriametrics.com/vmctl/#migrating-data-from-influxdb-1x), [Remote Read protocol](https://docs.victoriametrics.com/vmctl/#migrating-data-by-remote-read-protocol), [OpenTSDB](https://docs.victoriametrics.com/vmctl/#migrating-data-from-opentsdb), [Prometheus](https://docs.victoriametrics.com/vmctl/#migrating-data-from-prometheus) and [Promscale](https://docs.victoriametrics.com/vmctl/#migrating-data-from-promscale) migration modes.
* BUGFIX: prevent from automatic deletion of newly registered time series when it is queried immediately after the addition. The probability of this bug has been increased significantly after [v1.99.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.99.0) because of optimizations related to registering new time series. See [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5948) and [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959) issue.
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly set `Host` header in requests to scrape targets if it is specified via [`headers` option](https://docs.victoriametrics.com/sd_configs/#http-api-client-options). Thanks to @fholzer for [the bugreport](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5969) and [the fix](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5970).
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly set `Host` header in requests to scrape targets when [`server_name` option at `tls_config`](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#tls_config) is set. Previously the `Host` header was set incorrectly to the target hostname in this case.
* BUGFIX: do not drop `match[]` filter at [`/api/v1/series`](https://docs.victoriametrics.com/url-examples/#apiv1series) if `-search.ignoreExtraFiltersAtLabelsAPI` command-line flag is set, since missing `match[]` filter breaks `/api/v1/series` requests.

View File

@ -1485,14 +1485,45 @@ func (db *indexDB) searchMetricNameWithCache(dst []byte, metricID uint64) ([]byt
return dst, true
}
// Cannot find MetricName for the given metricID. This may be the case
// when indexDB contains incomplete set of metricID -> metricName entries
// after a snapshot or due to unflushed entries.
db.missingMetricNamesForMetricID.Add(1)
// Cannot find the MetricName for the given metricID.
// There are the following expected cases when this may happen:
//
// 1. The corresponding metricID -> metricName entry isn't visible for search yet.
// The solution is to wait for some time and try the search again.
// It is OK if newly registered time series isn't visible for search during some time.
// This should resolve https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5959
//
// 2. The metricID -> metricName entry doesn't exist in the indexdb.
// This is possible after unclean shutdown or after restoring of indexdb from a snapshot.
// In this case the metricID must be deleted, so new metricID is registered
// again when new sample for the given metricName is ingested next time.
//
ct := fasttime.UnixTimestamp()
db.s.missingMetricIDsLock.Lock()
if ct > db.s.missingMetricIDsResetDeadline {
db.s.missingMetricIDs = nil
db.s.missingMetricIDsResetDeadline = ct + 2*60
}
timestamp, ok := db.s.missingMetricIDs[metricID]
if !ok {
if db.s.missingMetricIDs == nil {
db.s.missingMetricIDs = make(map[uint64]uint64)
}
db.s.missingMetricIDs[metricID] = ct
timestamp = ct
}
db.s.missingMetricIDsLock.Unlock()
if ct > timestamp+60 {
// Cannot find the MetricName for the given metricID for the last 60 seconds.
// It is likely the indexDB contains incomplete set of metricID -> metricName entries
// after unclean shutdown or after restoring from a snapshot.
// Mark the metricID as deleted, so it is created again when new sample
// for the given time series is ingested next time.
db.missingMetricNamesForMetricID.Add(1)
db.deleteMetricIDs([]uint64{metricID})
}
// Mark the metricID as deleted, so it will be created again when new data point
// for the given time series will arrive.
db.deleteMetricIDs([]uint64{metricID})
return dst, false
}

View File

@ -147,6 +147,13 @@ type Storage struct {
deletedMetricIDs atomic.Pointer[uint64set.Set]
deletedMetricIDsUpdateLock sync.Mutex
// missingMetricIDs maps metricID to the timestamp of first unsuccessful lookup
// of metricName by the given metricID.
// This is used inside searchMetricNameWithCache() for detecting permanently missing metricID->metricName entries.
missingMetricIDsLock sync.Mutex
missingMetricIDs map[uint64]uint64
missingMetricIDsResetDeadline uint64
// isReadOnly is set to true when the storage is in read-only mode.
isReadOnly atomic.Bool
}