// VictoriaMetrics/app/vmalert/notifier/alert.go

package notifier

import (
	"bytes"
	"fmt"
	"io"
	"strings"
	textTpl "text/template"
	"time"

	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
	"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
)

// Alert is the triggered alert
// TODO: Looks like alert name isn't unique
type Alert struct {
	// GroupID contains the ID of the parent rules group
	GroupID uint64
	// Name represents the Alert name
	Name string
	// Labels is the list of label-value pairs attached to the Alert
	Labels map[string]string
	// Annotations is the list of annotations generated on Alert evaluation
	Annotations map[string]string
	// State represents the current state of the Alert
	State AlertState
	// Expr contains the expression that was executed to generate the Alert
	Expr string
	// ActiveAt defines the moment of time when the Alert became active
	ActiveAt time.Time
	// Start defines the moment of time when the Alert began firing
	Start time.Time
	// End defines the moment of time when the Alert is supposed to expire
	End time.Time
	// ResolvedAt defines the moment when the Alert switched from Firing to Inactive
	ResolvedAt time.Time
	// LastSent defines the moment when the Alert was last sent
	LastSent time.Time
	// KeepFiringSince defines the moment since when the Alert has been kept in
	// StateFiring only because of `keep_firing_for`, even though its expression
	// no longer matches
	KeepFiringSince time.Time
	// Value stores the value returned from evaluating the expression in the Expr field
	Value float64
	// ID is the unique identifier for the Alert
	ID uint64
	// Restored is true if the Alert was restored after restart
	Restored bool
	// For defines for how long the Alert needs to be active to become StateFiring
	For time.Duration
}
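
// An Alert produced by an alerting rule might look like the following sketch.
// This example is illustrative only: the field values are assumptions and are
// not taken from the actual vmalert rule-evaluation code.
//
//	a := &Alert{
//		GroupID:  42,
//		Name:     "HighRequestLatency",
//		Labels:   map[string]string{"alertname": "HighRequestLatency", "job": "api"},
//		Expr:     `histogram_quantile(0.99, rate(request_duration_seconds_bucket[5m])) > 0.5`,
//		State:    StatePending,
//		ActiveAt: time.Now(),
//		For:      5 * time.Minute,
//	}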

// AlertState type indicates the Alert state
type AlertState int

const (
	// StateInactive is the state of an alert that is neither firing nor pending.
	StateInactive AlertState = iota
	// StatePending is the state of an alert that has been active for less than
	// the configured threshold duration.
	StatePending
	// StateFiring is the state of an alert that has been active for longer than
	// the configured threshold duration.
	StateFiring
)

// String implements the fmt.Stringer interface for AlertState
func (as AlertState) String() string {
	switch as {
	case StateFiring:
		return "firing"
	case StatePending:
		return "pending"
	}
	return "inactive"
}
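
// For example, StateFiring.String() returns "firing", StatePending.String()
// returns "pending", and any other value, including StateInactive, falls
// through to "inactive".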

// AlertTplData is used to execute templating
type AlertTplData struct {
	Labels   map[string]string
	Value    float64
	Expr     string
	AlertID  uint64
	GroupID  uint64
	ActiveAt time.Time
	For      time.Duration
}

// tplHeaders contains the variable assignments which are prepended to every
// annotation template, so templates may refer to $value, $labels, etc.
var tplHeaders = []string{
	"{{ $value := .Value }}",
	"{{ $labels := .Labels }}",
	"{{ $expr := .Expr }}",
	"{{ $externalLabels := .ExternalLabels }}",
	"{{ $externalURL := .ExternalURL }}",
	"{{ $alertID := .AlertID }}",
	"{{ $groupID := .GroupID }}",
	"{{ $activeAt := .ActiveAt }}",
	"{{ $for := .For }}",
}
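
// Because these headers are prepended to every annotation before parsing,
// annotation templates can use the short variable forms directly. A
// hypothetical annotation value (not taken from this repository) could be:
//
//	{{ $labels.instance }} latency is {{ $value }}s for {{ $for }} (group {{ $groupID }})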

// ExecTemplate executes the Alert template for the given
// map of annotations.
// Every alert could have a different datasource, so the function
// requires a QueryFn as an argument.
func (a *Alert) ExecTemplate(q templates.QueryFn, labels, annotations map[string]string) (map[string]string, error) {
	tplData := AlertTplData{
		Value:    a.Value,
		Labels:   labels,
		Expr:     a.Expr,
		AlertID:  a.ID,
		GroupID:  a.GroupID,
		ActiveAt: a.ActiveAt,
		For:      a.For,
	}
	return ExecTemplate(q, annotations, tplData)
}
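
// A minimal usage sketch for ExecTemplate, assuming `a` is an *Alert and `q`
// is a templates.QueryFn backed by some datasource (both are placeholders,
// not defined in this file):
//
//	annotations, err := a.ExecTemplate(q, a.Labels, map[string]string{
//		"description": "{{ $labels.instance }} has been down for more than {{ $for }}",
//	})
//	if err != nil {
//		// some annotations failed to render; their raw text is kept in the result
//	}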

// ExecTemplate executes the given template for the given annotations map.
func ExecTemplate(q templates.QueryFn, annotations map[string]string, tplData AlertTplData) (map[string]string, error) {
	tmpl, err := templates.GetWithFuncs(templates.FuncsWithQuery(q))
	if err != nil {
		return nil, fmt.Errorf("error cloning template: %w", err)
	}
	return templateAnnotations(annotations, tplData, tmpl, true)
}

// ValidateTemplates validates annotations for possible template errors.
// It uses empty data for template population.
func ValidateTemplates(annotations map[string]string) error {
	tmpl, err := templates.Get()
	if err != nil {
		return err
	}
	_, err = templateAnnotations(annotations, AlertTplData{
		Labels: map[string]string{},
		Value:  0,
	}, tmpl, false)
	return err
}
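
// templateAnnotations renders each annotation in the given map with tplData.
// On a parse or execution error for a particular key, the raw annotation text
// is kept for that key and the error is collected; all errors are returned
// together via utils.ErrGroup.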
func templateAnnotations(annotations map[string]string, data AlertTplData, tmpl *textTpl.Template, execute bool) (map[string]string, error) {
	var builder strings.Builder
	var buf bytes.Buffer
	eg := new(utils.ErrGroup)
	r := make(map[string]string, len(annotations))
	tData := tplData{data, externalLabels, externalURL}
	header := strings.Join(tplHeaders, "")
	for key, text := range annotations {
		buf.Reset()
		builder.Reset()
		builder.Grow(len(header) + len(text))
		builder.WriteString(header)
		builder.WriteString(text)
		if err := templateAnnotation(&buf, builder.String(), tData, tmpl, execute); err != nil {
			r[key] = text
			eg.Add(fmt.Errorf("key %q, template %q: %w", key, text, err))
			continue
		}
		r[key] = buf.String()
	}
	return r, eg.Err()
}

// tplData is the full data object passed to annotation templates. It embeds
// AlertTplData and adds the package-level external labels and URL.
type tplData struct {
	AlertTplData
	ExternalLabels map[string]string
	ExternalURL    string
}

func templateAnnotation(dst io.Writer, text string, data tplData, tmpl *textTpl.Template, execute bool) error {
	tpl, err := tmpl.Clone()
	if err != nil {
		return fmt.Errorf("error cloning template before parse annotation: %w", err)
	}
	// Clone() doesn't copy tpl Options, so we set them manually
	tpl = tpl.Option("missingkey=zero")
	tpl, err = tpl.Parse(text)
	if err != nil {
		return fmt.Errorf("error parsing annotation template: %w", err)
	}
	if !execute {
		return nil
	}
	if err = tpl.Execute(dst, data); err != nil {
		return fmt.Errorf("error evaluating annotation template: %w", err)
	}
	return nil
}
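
// toPromLabels converts the alert's labels to a prompbmarshal.Label slice,
// applies the optional relabeling configs and returns the result sorted by
// label name.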
func (a Alert) toPromLabels(relabelCfg *promrelabel.ParsedConfigs) []prompbmarshal.Label {
	var labels []prompbmarshal.Label
	for k, v := range a.Labels {
		labels = append(labels, prompbmarshal.Label{
			Name:  k,
			Value: v,
		})
	}
	if relabelCfg != nil {
		labels = relabelCfg.Apply(labels, 0)
	}
	promrelabel.SortLabels(labels)
	return labels
}