mirror of https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-20 07:19:17 +01:00
The `external.label` flag is supposed to help distinguish the source of alerting or recording rules when more than one `vmalert` instance runs against the same datasource or AlertManager.
This commit is contained in:
parent 0da202023b
commit 2f1e7298ce

@@ -64,6 +64,8 @@ run-vmalert: vmalert
         -notifier.url=http://127.0.0.1:9093 \
         -remoteWrite.url=http://localhost:8428 \
         -remoteRead.url=http://localhost:8428 \
+        -external.label=cluster=east-1 \
+        -external.label=replica=a \
         -evaluationInterval=3s

 vmalert-amd64:

@@ -44,10 +44,19 @@ compatible storage address for storing recording rules results and alerts state
 Then configure `vmalert` accordingly:
 ```
 ./bin/vmalert -rule=alert.rules \
-    -datasource.url=http://localhost:8428 \
-    -notifier.url=http://localhost:9093
+    -datasource.url=http://localhost:8428 \ # PromQL compatible datasource
+    -notifier.url=http://localhost:9093 \ # AlertManager URL
+    -notifier.url=http://127.0.0.1:9093 \ # AlertManager replica URL
+    -remoteWrite.url=http://localhost:8428 \ # remote write compatible storage to persist rules
+    -remoteRead.url=http://localhost:8428 \ # PromQL compatible datasource to restore alerts state from
+    -external.label=cluster=east-1 \ # External label to be applied to each rule
+    -external.label=replica=a \ # Multiple external labels may be set
+    -evaluationInterval=3s # Default evaluation interval if not specified in rules group
 ```

+If you run multiple `vmalert` services for the same datastore or AlertManager, do not forget
+to specify different `external.label` flags so that it is clear which `vmalert` instance generated a given rule or alert.
+
 Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
 and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
 similar to Prometheus rules and configured using YAML. Configuration examples may be found

@@ -174,6 +183,8 @@ Usage of vmalert:
        How often to evaluate the rules (default 1m0s)
   -external.url string
        External URL is used as alert's source for sent alerts to the notifier
+  -external.label array
+       Optional label in the form 'name=value' to add to all generated recording rules and alerts. Pass multiple -label flags in order to add multiple label sets.
   -httpListenAddr string
        Address to listen for http connections (default ":8880")
   -metricsAuthKey string

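The `-external.label` flag documented above is an array flag: every occurrence on the command line adds one more `name=value` pair. Below is a minimal standard-library sketch of how such a repeatable flag can be modeled with `flag.Var`; the `arrayFlag` type and the simulated command line are illustrative only and are not vmalert's actual `flagutil` implementation.

```go
package main

import (
	"flag"
	"fmt"
)

// arrayFlag is a stand-in for a repeatable string flag:
// every occurrence of the flag appends one more value.
type arrayFlag []string

func (a *arrayFlag) String() string { return fmt.Sprint(*a) }

func (a *arrayFlag) Set(v string) error {
	*a = append(*a, v)
	return nil
}

func main() {
	var externalLabels arrayFlag
	flag.Var(&externalLabels, "external.label", "label in the form 'name=value'; may be repeated")

	// Simulate: vmalert -external.label=cluster=east-1 -external.label=replica=a
	flag.CommandLine.Parse([]string{"-external.label=cluster=east-1", "-external.label=replica=a"})
	fmt.Println(externalLabels) // [cluster=east-1 replica=a]
}
```
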
@@ -331,15 +331,22 @@ func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) p
 // Restore restores only Start field. Field State will be always Pending and supposed
 // to be updated on next Exec, as well as Value field.
 // Only rules with For > 0 will be restored.
-func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
+func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
     if q == nil {
         return fmt.Errorf("querier is nil")
     }
+
+    // account for external labels in filter
+    var labelsFilter string
+    for k, v := range labels {
+        labelsFilter += fmt.Sprintf(",%s=%q", k, v)
+    }
+
     // Get the last datapoint in range via MetricsQL `last_over_time`.
     // We don't use plain PromQL since Prometheus doesn't support
     // remote write protocol which is used for state persistence in vmalert.
-    expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
-        alertForStateMetricName, ar.Name, int(lookback.Seconds()))
+    expr := fmt.Sprintf("last_over_time(%s{alertname=%q%s}[%ds])",
+        alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
     qMetrics, err := q.Query(ctx, expr)
     if err != nil {
         return err

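For reference, a minimal standalone sketch of the query built by the patched `Restore`, assuming the state metric constant resolves to `ALERTS_FOR_STATE` (as in Prometheus) and that the external labels are `cluster=east-1` and `replica=a`; the alert name `ServiceDown` is invented for the example.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical inputs mirroring the hunk above.
	const alertForStateMetricName = "ALERTS_FOR_STATE"
	alertName := "ServiceDown"
	lookback := time.Hour
	labels := map[string]string{"cluster": "east-1", "replica": "a"}

	// Build the extra label filter the same way the patched Restore does.
	var labelsFilter string
	for k, v := range labels {
		labelsFilter += fmt.Sprintf(",%s=%q", k, v)
	}

	expr := fmt.Sprintf("last_over_time(%s{alertname=%q%s}[%ds])",
		alertForStateMetricName, alertName, labelsFilter, int(lookback.Seconds()))
	fmt.Println(expr)
	// Prints something like:
	// last_over_time(ALERTS_FOR_STATE{alertname="ServiceDown",cluster="east-1",replica="a"}[3600s])
	// (the order of cluster/replica may vary because Go map iteration order is random).
}
```
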
@@ -419,7 +419,7 @@ func TestAlertingRule_Restore(t *testing.T) {
             fq := &fakeQuerier{}
             tc.rule.GroupID = fakeGroup.ID()
             fq.add(tc.metrics...)
-            if err := tc.rule.Restore(context.TODO(), fq, time.Hour); err != nil {
+            if err := tc.rule.Restore(context.TODO(), fq, time.Hour, nil); err != nil {
                 t.Fatalf("unexpected err: %s", err)
             }
             if len(tc.rule.alerts) != len(tc.expAlerts) {

@@ -90,6 +90,14 @@ func (r *Rule) UnmarshalYAML(unmarshal func(interface{}) error) error {
     return nil
 }

+// Name returns Rule name according to its type
+func (r *Rule) Name() string {
+    if r.Record != "" {
+        return r.Record
+    }
+    return r.Alert
+}
+
 // HashRule hashes significant Rule fields into
 // unique hash value
 func HashRule(r Rule) uint64 {

@@ -32,7 +32,7 @@ type Group struct {
     updateCh chan *Group
 }

-func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
+func newGroup(cfg config.Group, defaultInterval time.Duration, labels map[string]string) *Group {
     g := &Group{
         Name: cfg.Name,
         File: cfg.File,

@@ -50,6 +50,17 @@ func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
     }
     rules := make([]Rule, len(cfg.Rules))
     for i, r := range cfg.Rules {
+        // override rule labels with external labels
+        for k, v := range labels {
+            if prevV, ok := r.Labels[k]; ok {
+                logger.Infof("label %q=%q for rule %q.%q overwritten with external label %q=%q",
+                    k, prevV, g.Name, r.Name(), k, v)
+            }
+            if r.Labels == nil {
+                r.Labels = map[string]string{}
+            }
+            r.Labels[k] = v
+        }
         rules[i] = g.newRule(r)
     }
     g.Rules = rules

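The override semantics added above (an external label always wins over a rule's own label, and the conflict is logged) can be shown with a small self-contained sketch; `mergeLabels`, the group name `groupTest` and the rule name `AlwaysFiring` are made up for illustration, and `logger.Infof` is replaced by `fmt.Printf`.

```go
package main

import "fmt"

// mergeLabels applies external labels on top of a rule's own labels,
// mirroring the override loop added to newGroup above.
func mergeLabels(groupName, ruleName string, ruleLabels, external map[string]string) map[string]string {
	if ruleLabels == nil {
		ruleLabels = map[string]string{}
	}
	for k, v := range external {
		if prev, ok := ruleLabels[k]; ok {
			fmt.Printf("label %q=%q for rule %q.%q overwritten with external label %q=%q\n",
				k, prev, groupName, ruleName, k, v)
		}
		ruleLabels[k] = v
	}
	return ruleLabels
}

func main() {
	merged := mergeLabels("groupTest", "AlwaysFiring",
		map[string]string{"severity": "warning", "cluster": "west-2"},
		map[string]string{"cluster": "east-1", "replica": "a"})
	fmt.Println(merged) // map[cluster:east-1 replica:a severity:warning]
}
```
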
@@ -74,7 +85,7 @@ func (g *Group) ID() uint64 {
 }

 // Restore restores alerts state for group rules
-func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
+func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
     for _, rule := range g.Rules {
         rr, ok := rule.(*AlertingRule)
         if !ok {

@@ -83,7 +94,7 @@ func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time
         if rr.For < 1 {
             continue
         }
-        if err := rr.Restore(ctx, q, lookback); err != nil {
+        if err := rr.Restore(ctx, q, lookback, labels); err != nil {
             return fmt.Errorf("error while restoring rule %q: %w", rule, err)
         }
     }

@@ -150,7 +150,7 @@ func TestGroupStart(t *testing.T) {
         t.Fatalf("failed to parse rules: %s", err)
     }
     const evalInterval = time.Millisecond
-    g := newGroup(groups[0], evalInterval)
+    g := newGroup(groups[0], evalInterval, map[string]string{"cluster": "east-1"})
     g.Concurrency = 2

     fn := &fakeNotifier{}

@@ -40,6 +40,8 @@ absolute path to all .yaml files in root.`)
     externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
     externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
 eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
+    externalLabels = flagutil.NewArray("external.label", "Optional label in the form 'name=value' to add to all generated recording rules and alerts. "+
+        "Pass multiple -label flags in order to add multiple label sets.")

     remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
         " For example, if lookback=1h then range from now() to now()-1h will be scanned.")

@@ -125,6 +127,7 @@ func newManager(ctx context.Context) (*manager, error) {
         groups:    make(map[uint64]*Group),
         querier:   q,
         notifiers: nts,
+        labels:    map[string]string{},
     }
     rw, err := remotewrite.Init(ctx)
     if err != nil {

@@ -137,6 +140,14 @@ func newManager(ctx context.Context) (*manager, error) {
         return nil, fmt.Errorf("failed to init remoteRead: %w", err)
     }
     manager.rr = rr
+
+    for _, s := range *externalLabels {
+        n := strings.IndexByte(s, '=')
+        if n < 0 {
+            return nil, fmt.Errorf("missing '=' in `-label`. It must contain label in the form `name=value`; got %q", s)
+        }
+        manager.labels[s[:n]] = s[n+1:]
+    }
     return manager, nil
 }

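A self-contained sketch of the `name=value` parsing that the hunk above adds to `newManager`; the `parseExternalLabels` helper is illustrative, as vmalert inlines this loop directly.

```go
package main

import (
	"fmt"
	"strings"
)

// parseExternalLabels mimics the loop added to newManager above: each
// -external.label value must look like name=value.
func parseExternalLabels(flagValues []string) (map[string]string, error) {
	labels := map[string]string{}
	for _, s := range flagValues {
		n := strings.IndexByte(s, '=')
		if n < 0 {
			return nil, fmt.Errorf("missing '=' in `-label`. It must contain label in the form `name=value`; got %q", s)
		}
		labels[s[:n]] = s[n+1:]
	}
	return labels, nil
}

func main() {
	labels, err := parseExternalLabels([]string{"cluster=east-1", "replica=a"})
	if err != nil {
		panic(err)
	}
	fmt.Println(labels) // map[cluster:east-1 replica:a]
}
```
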
@@ -21,7 +21,8 @@ type manager struct {
     rw *remotewrite.Client
     rr datasource.Querier

-    wg sync.WaitGroup
+    wg     sync.WaitGroup
+    labels map[string]string

     groupsMu sync.RWMutex
     groups   map[uint64]*Group

@@ -64,7 +65,7 @@ func (m *manager) close() {

 func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
     if restore && m.rr != nil {
-        err := group.Restore(ctx, m.rr, *remoteReadLookBack)
+        err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
         if err != nil {
             logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
         }

@@ -88,7 +89,7 @@ func (m *manager) update(ctx context.Context, path []string, validateTpl, valida

     groupsRegistry := make(map[uint64]*Group)
     for _, cfg := range groupsCfg {
-        ng := newGroup(cfg, *evaluationInterval)
+        ng := newGroup(cfg, *evaluationInterval, m.labels)
         groupsRegistry[ng.ID()] = ng
     }
