vmalert: add debug mode for alerting rules (#3055)

* vmalert: add `debug` mode for alerting rules

Debug information includes alerts state changes and requests
sent to the datasource. Debug can be enabled only on rule's
level. It might be useful for debugging unexpected
behaviour of alerting rule.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3025

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* vmalert: review fixes

Signed-off-by: hagen1778 <roman@victoriametrics.com>

* Update app/vmalert/alerting.go

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>

* vmalert: go fmt

Signed-off-by: hagen1778 <roman@victoriametrics.com>

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
This commit is contained in:
Roman Khavronenko 2022-09-13 15:25:43 +02:00 committed by GitHub
parent 25c9a1604a
commit 8441375da2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 84 additions and 2 deletions

View File

@ -184,6 +184,13 @@ expr: <string>
# as firing once they return.
[ for: <duration> | default = 0s ]
# Whether to print debug information into logs.
# Information includes alerts state changes and requests sent to the datasource.
# Please note, that if rule's query params contain sensitive
# information - it will be printed to logs.
# Is applicable to alerting rules only.
[ debug: <bool> | default = false ]
# Labels to add or overwrite for each alert.
labels:
[ <labelname>: <tmpl_string> ]

View File

@ -6,6 +6,7 @@ import (
"hash/fnv"
"sort"
"strconv"
"strings"
"sync"
"time"
@ -30,6 +31,7 @@ type AlertingRule struct {
GroupID uint64
GroupName string
EvalInterval time.Duration
Debug bool
q datasource.Querier
@ -71,11 +73,13 @@ func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule
GroupID: group.ID(),
GroupName: group.Name,
EvalInterval: group.Interval,
Debug: cfg.Debug,
q: qb.BuildWithParams(datasource.QuerierParams{
DataSourceType: group.Type.String(),
EvaluationInterval: group.Interval,
QueryParams: group.Params,
Headers: group.Headers,
Debug: cfg.Debug,
}),
alerts: make(map[uint64]*notifier.Alert),
metrics: &alertingRuleMetrics{},
@ -143,6 +147,32 @@ func (ar *AlertingRule) ID() uint64 {
return ar.RuleID
}
func (ar *AlertingRule) logDebug(at time.Time, a *notifier.Alert, msg string) {
if !ar.Debug {
return
}
prefix := fmt.Sprintf("DEBUG rule %q:%q (%d) at %v: ",
ar.GroupName, ar.Name, ar.RuleID, at.Format(time.RFC3339))
if a != nil {
labelKeys := make([]string, len(a.Labels))
var i int
for k := range a.Labels {
labelKeys[i] = k
i++
}
sort.Strings(labelKeys)
var labels string
for _, l := range labelKeys {
labels += fmt.Sprintf("%s=%s,", l, a.Labels[l])
}
labels = strings.TrimRight(labels, ",")
prefix += fmt.Sprintf("alert %d (%s) ", a.ID, labels)
}
logger.Infof("%s", prefix+msg)
}
type labelSet struct {
// origin labels from series
// used for templating
@ -255,9 +285,13 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
}
ar.logDebug(ts, nil, fmt.Sprintf("query returned %d samples (elapsed: %s)",
ar.lastExecSamples, ar.lastExecDuration))
for h, a := range ar.alerts {
// cleanup inactive alerts from previous Exec
if a.State == notifier.StateInactive && ts.Sub(a.ResolvedAt) > resolvedRetention {
ar.logDebug(ts, a, "deleted as inactive")
delete(ar.alerts, h)
}
}
@ -285,9 +319,10 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
// back to notifier.StatePending
a.State = notifier.StatePending
a.ActiveAt = ts
ar.logDebug(ts, a, "INACTIVE => PENDING")
}
if a.Value != m.Values[0] {
// update Value field with latest value
// update Value field with the latest value
a.Value = m.Values[0]
// and re-exec template since Value can be used
// in annotations
@ -307,6 +342,7 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
a.State = notifier.StatePending
a.ActiveAt = ts
ar.alerts[h] = a
ar.logDebug(ts, a, "created in state PENDING")
}
var numActivePending int
for h, a := range ar.alerts {
@ -317,11 +353,13 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
// alert was in Pending state - it is not
// active anymore
delete(ar.alerts, h)
ar.logDebug(ts, a, "PENDING => DELETED: is absent in current evaluation round")
continue
}
if a.State == notifier.StateFiring {
a.State = notifier.StateInactive
a.ResolvedAt = ts
ar.logDebug(ts, a, "FIRING => INACTIVE: is absent in current evaluation round")
}
continue
}
@ -330,6 +368,8 @@ func (ar *AlertingRule) Exec(ctx context.Context, ts time.Time, limit int) ([]pr
a.State = notifier.StateFiring
a.Start = ts
alertsFired.Inc()
ar.logDebug(ts, a, fmt.Sprintf("PENDING => FIRING: %s since becoming active at %v",
ts.Sub(a.ActiveAt), a.ActiveAt))
}
}
if limit > 0 && numActivePending > limit {

View File

@ -113,6 +113,7 @@ type Rule struct {
For *promutils.Duration `yaml:"for,omitempty"`
Labels map[string]string `yaml:"labels,omitempty"`
Annotations map[string]string `yaml:"annotations,omitempty"`
Debug bool `yaml:"debug,omitempty"`
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`

View File

@ -535,6 +535,21 @@ headers:
rules:
- alert: foo
expr: sum by(job) (up == 1)
`)
})
t.Run("`debug` change", func(t *testing.T) {
f(t, `
name: TestGroup
rules:
- alert: foo
expr: sum by(job) (up == 1)
`, `
name: TestGroup
rules:
- alert: foo
expr: sum by(job) (up == 1)
debug: true
`)
})
}

View File

@ -1,6 +1,6 @@
groups:
- name: TestGroup
interval: 2s
interval: 5s
concurrency: 2
limit: 1000
headers:
@ -11,6 +11,7 @@ groups:
- alert: Conns
expr: sum(vm_tcplistener_conns) by(instance) > 1
for: 3m
debug: true
annotations:
summary: Too high connection number for {{$labels.instance}}
{{ with printf "sum(vm_tcplistener_conns{instance=%q})" .Labels.instance | query }}

View File

@ -23,6 +23,7 @@ type QuerierParams struct {
EvaluationInterval time.Duration
QueryParams url.Values
Headers map[string]string
Debug bool
}
// Metric is the basic entity which should be return by datasource

View File

@ -9,6 +9,7 @@ import (
"strings"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
)
@ -39,6 +40,10 @@ type VMStorage struct {
evaluationInterval time.Duration
extraParams url.Values
extraHeaders []keyValue
// whether to print additional log messages
// for each sent request
debug bool
}
type keyValue struct {
@ -64,6 +69,7 @@ func (s *VMStorage) ApplyParams(params QuerierParams) *VMStorage {
s.dataSourceType = toDatasourceType(params.DataSourceType)
s.evaluationInterval = params.EvaluationInterval
s.extraParams = params.QueryParams
s.debug = params.Debug
if params.Headers != nil {
for key, value := range params.Headers {
kv := keyValue{key: key, value: value}
@ -151,6 +157,9 @@ func (s *VMStorage) QueryRange(ctx context.Context, query string, start, end tim
}
func (s *VMStorage) do(ctx context.Context, req *http.Request) (*http.Response, error) {
if s.debug {
logger.Infof("DEBUG datasource request: executing %s request with params %q", req.Method, req.URL.RawQuery)
}
resp, err := s.c.Do(req.WithContext(ctx))
if err != nil {
return nil, fmt.Errorf("error getting response from %s: %w", req.URL.Redacted(), err)

View File

@ -22,6 +22,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* FEATURE: set the `start` arg to `end - 5 minutes` if isn't passed explicitly to [/api/v1/labels](https://docs.victoriametrics.com/url-examples.html#apiv1labels) and [/api/v1/label/.../values](https://docs.victoriametrics.com/url-examples.html#apiv1labelvalues). See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3052).
* FEATURE: [vmctl](https://docs.victoriametrics.com/vmctl.html): add `vm-native-step-interval` command line flag for `vm-native` mode. New option allows splitting the import process into chunks by time interval. This helps migrating data sets with high churn rate and provides better control over the process. See [feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2733).
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add `top queries` tab, which shows various stats for recently executed queries. See [these docs](https://docs.victoriametrics.com/#top-queries) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2707).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add `debug` mode to the alerting rule settings for printing additional information into logs during evaluation. See `debug` param in [alerting rule config](https://docs.victoriametrics.com/vmalert.html#alerting-rules)
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly calculate `rate_over_sum(m[d])` as `sum_over_time(m[d])/d`. Previously the `sum_over_time(m[d])` could be improperly divided by smaller than `d` time range. See [rate_over_sum() docs](https://docs.victoriametrics.com/MetricsQL.html#rate_over_sum) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3045).
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): properly calculate query results at `vmselect`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3067). The issue has been introduced in [v1.81.0](https://docs.victoriametrics.com/CHANGELOG.html#v1810).

View File

@ -188,6 +188,13 @@ expr: <string>
# as firing once they return.
[ for: <duration> | default = 0s ]
# Whether to print debug information into logs.
# Information includes alerts state changes and requests sent to the datasource.
# Please note, that if rule's query params contain sensitive
# information - it will be printed to logs.
# Is applicable to alerting rules only.
[ debug: <bool> | default = false ]
# Labels to add or overwrite for each alert.
labels:
[ <labelname>: <tmpl_string> ]