mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-20 07:19:17 +01:00
vmalert: expose new metrics for tracking number of produced samples during last evaluation (#1518)
* vmalert: expose new metrics for tracking number of produced samples during last evaluation

Two new metrics were added to track the number of samples produced during the last evaluation:
* vmalert_recording_rules_last_evaluation_samples
* vmalert_alerting_rules_last_evaluation_samples

The gauge type is used to remain consistent with the Prometheus metric `prometheus_rule_group_last_evaluation_samples`, which is on the group level. However, the counter type was considered as well.

Two metrics instead of one are used to make it easier to separate recording and alerting rules. It is likely that the number of samples produced by recording rules is more important, so people will refer to it more frequently.

The expected usage of the new metric is the following:
```
- alert: RecordingRuleReturnsEmptyResults
  expr: sum(vmalert_recording_rules_last_evaluation_samples) by(recording) < 1
  annotations:
    summary: Recording rule {{$labels.recording}} returns empty results. Please verify expression correctness.
```

Addresses https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1494

* vmalert: rename `vmalert_alerts_error` to `vmalert_alerting_rules_error` to remain consistent with recording rules metrics
This commit is contained in:
parent
d826352688
commit
7416fdaa8b
@ -42,6 +42,9 @@ type AlertingRule struct {
|
|||||||
// resets on every successful Exec
|
// resets on every successful Exec
|
||||||
// may be used as Health state
|
// may be used as Health state
|
||||||
lastExecError error
|
lastExecError error
|
||||||
|
// stores the number of samples returned during
|
||||||
|
// the last evaluation
|
||||||
|
lastExecSamples int
|
||||||
|
|
||||||
metrics *alertingRuleMetrics
|
metrics *alertingRuleMetrics
|
||||||
}
|
}
|
||||||
@ -50,6 +53,7 @@ type alertingRuleMetrics struct {
|
|||||||
errors *gauge
|
errors *gauge
|
||||||
pending *gauge
|
pending *gauge
|
||||||
active *gauge
|
active *gauge
|
||||||
|
samples *gauge
|
||||||
}
|
}
|
||||||
|
|
||||||
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
|
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
|
||||||
@ -76,8 +80,8 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||||||
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
|
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
|
||||||
ar.metrics.pending = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
|
ar.metrics.pending = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
|
||||||
func() float64 {
|
func() float64 {
|
||||||
ar.mu.Lock()
|
ar.mu.RLock()
|
||||||
defer ar.mu.Unlock()
|
defer ar.mu.RUnlock()
|
||||||
var num int
|
var num int
|
||||||
for _, a := range ar.alerts {
|
for _, a := range ar.alerts {
|
||||||
if a.State == notifier.StatePending {
|
if a.State == notifier.StatePending {
|
||||||
@ -88,8 +92,8 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||||||
})
|
})
|
||||||
ar.metrics.active = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
|
ar.metrics.active = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
|
||||||
func() float64 {
|
func() float64 {
|
||||||
ar.mu.Lock()
|
ar.mu.RLock()
|
||||||
defer ar.mu.Unlock()
|
defer ar.mu.RUnlock()
|
||||||
var num int
|
var num int
|
||||||
for _, a := range ar.alerts {
|
for _, a := range ar.alerts {
|
||||||
if a.State == notifier.StateFiring {
|
if a.State == notifier.StateFiring {
|
||||||
@ -98,15 +102,21 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
|
|||||||
}
|
}
|
||||||
return float64(num)
|
return float64(num)
|
||||||
})
|
})
|
||||||
ar.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_error{%s}`, labels),
|
ar.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
|
||||||
func() float64 {
|
func() float64 {
|
||||||
ar.mu.Lock()
|
ar.mu.RLock()
|
||||||
defer ar.mu.Unlock()
|
defer ar.mu.RUnlock()
|
||||||
if ar.lastExecError == nil {
|
if ar.lastExecError == nil {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
return 1
|
return 1
|
||||||
})
|
})
|
||||||
|
ar.metrics.samples = getOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
|
||||||
|
func() float64 {
|
||||||
|
ar.mu.RLock()
|
||||||
|
defer ar.mu.RUnlock()
|
||||||
|
return float64(ar.lastExecSamples)
|
||||||
|
})
|
||||||
return ar
|
return ar
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -115,6 +125,7 @@ func (ar *AlertingRule) Close() {
|
|||||||
metrics.UnregisterMetric(ar.metrics.active.name)
|
metrics.UnregisterMetric(ar.metrics.active.name)
|
||||||
metrics.UnregisterMetric(ar.metrics.pending.name)
|
metrics.UnregisterMetric(ar.metrics.pending.name)
|
||||||
metrics.UnregisterMetric(ar.metrics.errors.name)
|
metrics.UnregisterMetric(ar.metrics.errors.name)
|
||||||
|
metrics.UnregisterMetric(ar.metrics.samples.name)
|
||||||
}
|
}
|
||||||
|
|
||||||
// String implements Stringer interface
|
// String implements Stringer interface
|
||||||
@ -194,6 +205,7 @@ func (ar *AlertingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, e
|
|||||||
|
|
||||||
ar.lastExecError = err
|
ar.lastExecError = err
|
||||||
ar.lastExecTime = time.Now()
|
ar.lastExecTime = time.Now()
|
||||||
|
ar.lastExecSamples = len(qMetrics)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
||||||
}
|
}
|
||||||
@ -384,6 +396,7 @@ func (ar *AlertingRule) RuleAPI() APIAlertingRule {
|
|||||||
Expression: ar.Expr,
|
Expression: ar.Expr,
|
||||||
For: ar.For.String(),
|
For: ar.For.String(),
|
||||||
LastError: lastErr,
|
LastError: lastErr,
|
||||||
|
LastSamples: ar.lastExecSamples,
|
||||||
LastExec: ar.lastExecTime,
|
LastExec: ar.lastExecTime,
|
||||||
Labels: ar.Labels,
|
Labels: ar.Labels,
|
||||||
Annotations: ar.Annotations,
|
Annotations: ar.Annotations,
|
||||||
|
@ -35,12 +35,16 @@ type RecordingRule struct {
|
|||||||
// resets on every successful Exec
|
// resets on every successful Exec
|
||||||
// may be used as Health state
|
// may be used as Health state
|
||||||
lastExecError error
|
lastExecError error
|
||||||
|
// stores the number of samples returned during
|
||||||
|
// the last evaluation
|
||||||
|
lastExecSamples int
|
||||||
|
|
||||||
metrics *recordingRuleMetrics
|
metrics *recordingRuleMetrics
|
||||||
}
|
}
|
||||||
|
|
||||||
type recordingRuleMetrics struct {
|
type recordingRuleMetrics struct {
|
||||||
errors *gauge
|
errors *gauge
|
||||||
|
samples *gauge
|
||||||
}
|
}
|
||||||
|
|
||||||
// String implements Stringer interface
|
// String implements Stringer interface
|
||||||
@ -73,19 +77,26 @@ func newRecordingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rul
|
|||||||
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
|
labels := fmt.Sprintf(`recording=%q, group=%q, id="%d"`, rr.Name, group.Name, rr.ID())
|
||||||
rr.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
|
rr.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_error{%s}`, labels),
|
||||||
func() float64 {
|
func() float64 {
|
||||||
rr.mu.Lock()
|
rr.mu.RLock()
|
||||||
defer rr.mu.Unlock()
|
defer rr.mu.RUnlock()
|
||||||
if rr.lastExecError == nil {
|
if rr.lastExecError == nil {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
return 1
|
return 1
|
||||||
})
|
})
|
||||||
|
rr.metrics.samples = getOrCreateGauge(fmt.Sprintf(`vmalert_recording_rules_last_evaluation_samples{%s}`, labels),
|
||||||
|
func() float64 {
|
||||||
|
rr.mu.RLock()
|
||||||
|
defer rr.mu.RUnlock()
|
||||||
|
return float64(rr.lastExecSamples)
|
||||||
|
})
|
||||||
return rr
|
return rr
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close unregisters rule metrics
|
// Close unregisters rule metrics
|
||||||
func (rr *RecordingRule) Close() {
|
func (rr *RecordingRule) Close() {
|
||||||
metrics.UnregisterMetric(rr.metrics.errors.name)
|
metrics.UnregisterMetric(rr.metrics.errors.name)
|
||||||
|
metrics.UnregisterMetric(rr.metrics.samples.name)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ExecRange executes recording rule on the given time range similarly to Exec.
|
// ExecRange executes recording rule on the given time range similarly to Exec.
|
||||||
@ -118,6 +129,7 @@ func (rr *RecordingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries,
|
|||||||
|
|
||||||
rr.lastExecTime = time.Now()
|
rr.lastExecTime = time.Now()
|
||||||
rr.lastExecError = err
|
rr.lastExecError = err
|
||||||
|
rr.lastExecSamples = len(qMetrics)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
return nil, fmt.Errorf("failed to execute query %q: %w", rr.Expr, err)
|
||||||
}
|
}
|
||||||
@ -190,13 +202,14 @@ func (rr *RecordingRule) RuleAPI() APIRecordingRule {
|
|||||||
}
|
}
|
||||||
return APIRecordingRule{
|
return APIRecordingRule{
|
||||||
// encode as strings to avoid rounding
|
// encode as strings to avoid rounding
|
||||||
ID: fmt.Sprintf("%d", rr.ID()),
|
ID: fmt.Sprintf("%d", rr.ID()),
|
||||||
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
GroupID: fmt.Sprintf("%d", rr.GroupID),
|
||||||
Name: rr.Name,
|
Name: rr.Name,
|
||||||
Type: rr.Type.String(),
|
Type: rr.Type.String(),
|
||||||
Expression: rr.Expr,
|
Expression: rr.Expr,
|
||||||
LastError: lastErr,
|
LastError: lastErr,
|
||||||
LastExec: rr.lastExecTime,
|
LastSamples: rr.lastExecSamples,
|
||||||
Labels: rr.Labels,
|
LastExec: rr.lastExecTime,
|
||||||
|
Labels: rr.Labels,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -40,6 +40,7 @@ type APIAlertingRule struct {
|
|||||||
Expression string `json:"expression"`
|
Expression string `json:"expression"`
|
||||||
For string `json:"for"`
|
For string `json:"for"`
|
||||||
LastError string `json:"last_error"`
|
LastError string `json:"last_error"`
|
||||||
|
LastSamples int `json:"last_samples"`
|
||||||
LastExec time.Time `json:"last_exec"`
|
LastExec time.Time `json:"last_exec"`
|
||||||
Labels map[string]string `json:"labels"`
|
Labels map[string]string `json:"labels"`
|
||||||
Annotations map[string]string `json:"annotations"`
|
Annotations map[string]string `json:"annotations"`
|
||||||
@ -47,12 +48,13 @@ type APIAlertingRule struct {
|
|||||||
|
|
||||||
// APIRecordingRule represents RecordingRule for WEB view
|
// APIRecordingRule represents RecordingRule for WEB view
|
||||||
type APIRecordingRule struct {
|
type APIRecordingRule struct {
|
||||||
ID string `json:"id"`
|
ID string `json:"id"`
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Type string `json:"type"`
|
Type string `json:"type"`
|
||||||
GroupID string `json:"group_id"`
|
GroupID string `json:"group_id"`
|
||||||
Expression string `json:"expression"`
|
Expression string `json:"expression"`
|
||||||
LastError string `json:"last_error"`
|
LastError string `json:"last_error"`
|
||||||
LastExec time.Time `json:"last_exec"`
|
LastSamples int `json:"last_samples"`
|
||||||
Labels map[string]string `json:"labels"`
|
LastExec time.Time `json:"last_exec"`
|
||||||
|
Labels map[string]string `json:"labels"`
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user