app/vmalert: support external.label to specify global labelset for all rules #622 (#652)

`external.label` flag supposed to help to distinguish alert or recording rules
source in situations when more than one `vmalert` runs for the same datasource
or AlertManager.
This commit is contained in:
Roman Khavronenko 2020-07-28 12:20:31 +01:00 committed by GitHub
parent 0da202023b
commit 2f1e7298ce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 64 additions and 13 deletions

View File

@ -64,6 +64,8 @@ run-vmalert: vmalert
-notifier.url=http://127.0.0.1:9093 \
-remoteWrite.url=http://localhost:8428 \
-remoteRead.url=http://localhost:8428 \
-external.label=cluster=east-1 \
-external.label=replica=a \
-evaluationInterval=3s
vmalert-amd64:

View File

@ -44,10 +44,19 @@ compatible storage address for storing recording rules results and alerts state
Then configure `vmalert` accordingly:
```
./bin/vmalert -rule=alert.rules \
-datasource.url=http://localhost:8428 \
-notifier.url=http://localhost:9093
-datasource.url=http://localhost:8428 \ # PromQL compatible datasource
-notifier.url=http://localhost:9093 \ # AlertManager URL
-notifier.url=http://127.0.0.1:9093 \ # AlertManager replica URL
-remoteWrite.url=http://localhost:8428 \ # remote write compatible storage to persist rules
-remoteRead.url=http://localhost:8428 \ # PromQL compatible datasource to restore alerts state from
-external.label=cluster=east-1 \ # External label to be applied for each rule
-external.label=replica=a \ # Multiple external labels may be set
-evaluationInterval=3s # Default evaluation interval if not specified in rules group
```
If you run multiple `vmalert` services for the same datastore or AlertManager - do not forget
to specify different `external.label` flags in order to define which `vmalert` generated rules or alerts.
Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
similar to Prometheus rules and configured using YAML. Configuration examples may be found
@ -174,6 +183,8 @@ Usage of vmalert:
How often to evaluate the rules (default 1m0s)
-external.url string
External URL is used as alert's source for sent alerts to the notifier
-external.label array
Optional label in the form 'name=value' to add to all generated recording rules and alerts. Pass multiple -label flags in order to add multiple label sets.
-httpListenAddr string
Address to listen for http connections (default ":8880")
-metricsAuthKey string

View File

@ -331,15 +331,22 @@ func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) p
// Restore restores only Start field. Field State will be always Pending and supposed
// to be updated on next Exec, as well as Value field.
// Only rules with For > 0 will be restored.
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
if q == nil {
return fmt.Errorf("querier is nil")
}
// account for external labels in filter
var labelsFilter string
for k, v := range labels {
labelsFilter += fmt.Sprintf(",%s=%q", k, v)
}
// Get the last datapoint in range via MetricsQL `last_over_time`.
// We don't use plain PromQL since Prometheus doesn't support
// remote write protocol which is used for state persistence in vmalert.
expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
alertForStateMetricName, ar.Name, int(lookback.Seconds()))
expr := fmt.Sprintf("last_over_time(%s{alertname=%q%s}[%ds])",
alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
qMetrics, err := q.Query(ctx, expr)
if err != nil {
return err

View File

@ -419,7 +419,7 @@ func TestAlertingRule_Restore(t *testing.T) {
fq := &fakeQuerier{}
tc.rule.GroupID = fakeGroup.ID()
fq.add(tc.metrics...)
if err := tc.rule.Restore(context.TODO(), fq, time.Hour); err != nil {
if err := tc.rule.Restore(context.TODO(), fq, time.Hour, nil); err != nil {
t.Fatalf("unexpected err: %s", err)
}
if len(tc.rule.alerts) != len(tc.expAlerts) {

View File

@ -90,6 +90,14 @@ func (r *Rule) UnmarshalYAML(unmarshal func(interface{}) error) error {
return nil
}
// Name returns Rule name according to its type
func (r *Rule) Name() string {
if r.Record != "" {
return r.Record
}
return r.Alert
}
// HashRule hashes significant Rule fields into
// unique hash value
func HashRule(r Rule) uint64 {

View File

@ -32,7 +32,7 @@ type Group struct {
updateCh chan *Group
}
func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
func newGroup(cfg config.Group, defaultInterval time.Duration, labels map[string]string) *Group {
g := &Group{
Name: cfg.Name,
File: cfg.File,
@ -50,6 +50,17 @@ func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
}
rules := make([]Rule, len(cfg.Rules))
for i, r := range cfg.Rules {
// override rule labels with external labels
for k, v := range labels {
if prevV, ok := r.Labels[k]; ok {
logger.Infof("label %q=%q for rule %q.%q overwritten with external label %q=%q",
k, prevV, g.Name, r.Name(), k, v)
}
if r.Labels == nil {
r.Labels = map[string]string{}
}
r.Labels[k] = v
}
rules[i] = g.newRule(r)
}
g.Rules = rules
@ -74,7 +85,7 @@ func (g *Group) ID() uint64 {
}
// Restore restores alerts state for group rules
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
for _, rule := range g.Rules {
rr, ok := rule.(*AlertingRule)
if !ok {
@ -83,7 +94,7 @@ func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time
if rr.For < 1 {
continue
}
if err := rr.Restore(ctx, q, lookback); err != nil {
if err := rr.Restore(ctx, q, lookback, labels); err != nil {
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
}
}

View File

@ -150,7 +150,7 @@ func TestGroupStart(t *testing.T) {
t.Fatalf("failed to parse rules: %s", err)
}
const evalInterval = time.Millisecond
g := newGroup(groups[0], evalInterval)
g := newGroup(groups[0], evalInterval, map[string]string{"cluster": "east-1"})
g.Concurrency = 2
fn := &fakeNotifier{}

View File

@ -40,6 +40,8 @@ absolute path to all .yaml files in root.`)
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
externalLabels = flagutil.NewArray("external.label", "Optional label in the form 'name=value' to add to all generated recording rules and alerts. "+
"Pass multiple -label flags in order to add multiple label sets.")
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
@ -125,6 +127,7 @@ func newManager(ctx context.Context) (*manager, error) {
groups: make(map[uint64]*Group),
querier: q,
notifiers: nts,
labels: map[string]string{},
}
rw, err := remotewrite.Init(ctx)
if err != nil {
@ -137,6 +140,14 @@ func newManager(ctx context.Context) (*manager, error) {
return nil, fmt.Errorf("failed to init remoteRead: %w", err)
}
manager.rr = rr
for _, s := range *externalLabels {
n := strings.IndexByte(s, '=')
if n < 0 {
return nil, fmt.Errorf("missing '=' in `-label`. It must contain label in the form `name=value`; got %q", s)
}
manager.labels[s[:n]] = s[n+1:]
}
return manager, nil
}

View File

@ -22,6 +22,7 @@ type manager struct {
rr datasource.Querier
wg sync.WaitGroup
labels map[string]string
groupsMu sync.RWMutex
groups map[uint64]*Group
@ -64,7 +65,7 @@ func (m *manager) close() {
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
if restore && m.rr != nil {
err := group.Restore(ctx, m.rr, *remoteReadLookBack)
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
if err != nil {
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
}
@ -88,7 +89,7 @@ func (m *manager) update(ctx context.Context, path []string, validateTpl, valida
groupsRegistry := make(map[uint64]*Group)
for _, cfg := range groupsCfg {
ng := newGroup(cfg, *evaluationInterval)
ng := newGroup(cfg, *evaluationInterval, m.labels)
groupsRegistry[ng.ID()] = ng
}