The `external.label` flag is supposed to help distinguish the source of alerting or recording rules in situations when more than one `vmalert` runs against the same datasource or AlertManager.
Commit 2f1e7298ce (parent 0da202023b)
@@ -64,6 +64,8 @@ run-vmalert: vmalert
 		-notifier.url=http://127.0.0.1:9093 \
 		-remoteWrite.url=http://localhost:8428 \
 		-remoteRead.url=http://localhost:8428 \
+		-external.label=cluster=east-1 \
+		-external.label=replica=a \
 		-evaluationInterval=3s

 vmalert-amd64:
@@ -44,10 +44,19 @@ compatible storage address for storing recording rules results and alerts state
 Then configure `vmalert` accordingly:
 ```
 ./bin/vmalert -rule=alert.rules \
-    -datasource.url=http://localhost:8428 \
-    -notifier.url=http://localhost:9093
+    -datasource.url=http://localhost:8428 \   # PromQL compatible datasource
+    -notifier.url=http://localhost:9093 \     # AlertManager URL
+    -notifier.url=http://127.0.0.1:9093 \     # AlertManager replica URL
+    -remoteWrite.url=http://localhost:8428 \  # remote write compatible storage to persist rules
+    -remoteRead.url=http://localhost:8428 \   # PromQL compatible datasource to restore alerts state from
+    -external.label=cluster=east-1 \          # external label to be applied to each rule
+    -external.label=replica=a \               # multiple external labels may be set
+    -evaluationInterval=3s                    # default evaluation interval if not specified in rules group
 ```

+If you run multiple `vmalert` services for the same datastore or AlertManager, do not forget
+to specify different `external.label` flags in order to distinguish which `vmalert` generated each rule or alert.
+
 Configuration for [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/)
 and [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) rules is very
 similar to Prometheus rules and configured using YAML. Configuration examples may be found
@@ -174,6 +183,8 @@ Usage of vmalert:
        How often to evaluate the rules (default 1m0s)
   -external.url string
        External URL is used as alert's source for sent alerts to the notifier
+  -external.label array
+       Optional label in the form 'name=value' to add to all generated recording rules and alerts. Pass multiple -label flags in order to add multiple label sets.
   -httpListenAddr string
        Address to listen for http connections (default ":8880")
   -metricsAuthKey string
@@ -331,15 +331,22 @@ func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) p
 // Restore restores only Start field. Field State will be always Pending and supposed
 // to be updated on next Exec, as well as Value field.
 // Only rules with For > 0 will be restored.
-func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
+func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
 	if q == nil {
 		return fmt.Errorf("querier is nil")
 	}
+
+	// account for external labels in filter
+	var labelsFilter string
+	for k, v := range labels {
+		labelsFilter += fmt.Sprintf(",%s=%q", k, v)
+	}
+
 	// Get the last datapoint in range via MetricsQL `last_over_time`.
 	// We don't use plain PromQL since Prometheus doesn't support
 	// remote write protocol which is used for state persistence in vmalert.
-	expr := fmt.Sprintf("last_over_time(%s{alertname=%q}[%ds])",
-		alertForStateMetricName, ar.Name, int(lookback.Seconds()))
+	expr := fmt.Sprintf("last_over_time(%s{alertname=%q%s}[%ds])",
+		alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
 	qMetrics, err := q.Query(ctx, expr)
 	if err != nil {
 		return err
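A minimal, standalone sketch of how the filter built above ends up in the restore query. The example labels come from the Makefile change in this commit; the metric-name constant's value is an assumption here (it follows the Prometheus `ALERTS_FOR_STATE` convention), and the program is illustrative only, not part of the commit:

```go
package main

import "fmt"

// alertForStateMetricName stands in for the constant vmalert uses for alert
// state persistence; the exact value is assumed for this sketch.
const alertForStateMetricName = "ALERTS_FOR_STATE"

func main() {
	labels := map[string]string{"cluster": "east-1", "replica": "a"}

	// Every external label becomes an additional matcher in the selector,
	// mirroring the loop added to Restore.
	var labelsFilter string
	for k, v := range labels {
		labelsFilter += fmt.Sprintf(",%s=%q", k, v)
	}

	lookbackSeconds := 3600 // e.g. -remoteRead.lookback=1h
	expr := fmt.Sprintf("last_over_time(%s{alertname=%q%s}[%ds])",
		alertForStateMetricName, "ExampleAlert", labelsFilter, lookbackSeconds)

	// Prints something like:
	// last_over_time(ALERTS_FOR_STATE{alertname="ExampleAlert",cluster="east-1",replica="a"}[3600s])
	// (map iteration order is not fixed, so the label order may vary)
	fmt.Println(expr)
}
```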
@@ -419,7 +419,7 @@ func TestAlertingRule_Restore(t *testing.T) {
 			fq := &fakeQuerier{}
 			tc.rule.GroupID = fakeGroup.ID()
 			fq.add(tc.metrics...)
-			if err := tc.rule.Restore(context.TODO(), fq, time.Hour); err != nil {
+			if err := tc.rule.Restore(context.TODO(), fq, time.Hour, nil); err != nil {
 				t.Fatalf("unexpected err: %s", err)
 			}
 			if len(tc.rule.alerts) != len(tc.expAlerts) {
@@ -90,6 +90,14 @@ func (r *Rule) UnmarshalYAML(unmarshal func(interface{}) error) error {
 	return nil
 }

+// Name returns Rule name according to its type
+func (r *Rule) Name() string {
+	if r.Record != "" {
+		return r.Record
+	}
+	return r.Alert
+}
+
 // HashRule hashes significant Rule fields into
 // unique hash value
 func HashRule(r Rule) uint64 {
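A small self-contained illustration of the new `Name()` helper. The `Rule` struct below is a stripped-down stand-in keeping only the two fields the method reads; the real type lives in vmalert's config package:

```go
package main

import "fmt"

// Rule is a simplified stand-in for vmalert's config.Rule.
type Rule struct {
	Record string // set for recording rules
	Alert  string // set for alerting rules
}

// Name returns Rule name according to its type.
func (r *Rule) Name() string {
	if r.Record != "" {
		return r.Record
	}
	return r.Alert
}

func main() {
	rec := Rule{Record: "instance:requests:rate5m"}
	al := Rule{Alert: "HighErrorRate"}
	fmt.Println(rec.Name()) // instance:requests:rate5m
	fmt.Println(al.Name())  // HighErrorRate
}
```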
@@ -32,7 +32,7 @@ type Group struct {
 	updateCh chan *Group
 }

-func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
+func newGroup(cfg config.Group, defaultInterval time.Duration, labels map[string]string) *Group {
 	g := &Group{
 		Name: cfg.Name,
 		File: cfg.File,
@@ -50,6 +50,17 @@ func newGroup(cfg config.Group, defaultInterval time.Duration) *Group {
 	}
 	rules := make([]Rule, len(cfg.Rules))
 	for i, r := range cfg.Rules {
+		// override rule labels with external labels
+		for k, v := range labels {
+			if prevV, ok := r.Labels[k]; ok {
+				logger.Infof("label %q=%q for rule %q.%q overwritten with external label %q=%q",
+					k, prevV, g.Name, r.Name(), k, v)
+			}
+			if r.Labels == nil {
+				r.Labels = map[string]string{}
+			}
+			r.Labels[k] = v
+		}
 		rules[i] = g.newRule(r)
 	}
 	g.Rules = rules
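The override loop above can be exercised in isolation. A sketch under the assumption that rule labels are a plain `map[string]string` (as in the config package), using the example external labels from this commit; `fmt.Printf` stands in for `logger.Infof`:

```go
package main

import "fmt"

func main() {
	external := map[string]string{"cluster": "east-1", "replica": "a"}

	// A rule that already defines one of the external label names.
	ruleLabels := map[string]string{"cluster": "west", "severity": "critical"}

	for k, v := range external {
		if prevV, ok := ruleLabels[k]; ok {
			// vmalert reports this via logger.Infof; plain output is used here.
			fmt.Printf("label %q=%q overwritten with external label %q=%q\n", k, prevV, k, v)
		}
		ruleLabels[k] = v
	}

	// Result: cluster=east-1 (overridden), replica=a (added), severity=critical (kept).
	fmt.Println(ruleLabels)
}
```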
@@ -74,7 +85,7 @@ func (g *Group) ID() uint64 {
 }

 // Restore restores alerts state for group rules
-func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
+func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
 	for _, rule := range g.Rules {
 		rr, ok := rule.(*AlertingRule)
 		if !ok {
@@ -83,7 +94,7 @@ func (g *Group) Restore(ctx context.Context, q datasource.Querier, lookback time
 		if rr.For < 1 {
 			continue
 		}
-		if err := rr.Restore(ctx, q, lookback); err != nil {
+		if err := rr.Restore(ctx, q, lookback, labels); err != nil {
 			return fmt.Errorf("error while restoring rule %q: %w", rule, err)
 		}
 	}
@@ -150,7 +150,7 @@ func TestGroupStart(t *testing.T) {
 		t.Fatalf("failed to parse rules: %s", err)
 	}
 	const evalInterval = time.Millisecond
-	g := newGroup(groups[0], evalInterval)
+	g := newGroup(groups[0], evalInterval, map[string]string{"cluster": "east-1"})
 	g.Concurrency = 2

 	fn := &fakeNotifier{}
@@ -40,6 +40,8 @@ absolute path to all .yaml files in root.`)
 	externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
 	externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service.
 eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|pathEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`)
+	externalLabels = flagutil.NewArray("external.label", "Optional label in the form 'name=value' to add to all generated recording rules and alerts. "+
+		"Pass multiple -label flags in order to add multiple label sets.")

 	remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+
 		" For example, if lookback=1h then range from now() to now()-1h will be scanned.")
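`flagutil.NewArray` is VictoriaMetrics' internal helper for flags that may be passed multiple times. A rough standalone equivalent built on the standard `flag` package is sketched below; it illustrates the repeated-flag pattern only and is not the actual `flagutil` implementation:

```go
package main

import (
	"flag"
	"fmt"
)

// arrayFlags collects every occurrence of a repeatable string flag.
type arrayFlags []string

func (a *arrayFlags) String() string { return fmt.Sprint(*a) }

func (a *arrayFlags) Set(value string) error {
	*a = append(*a, value)
	return nil
}

func main() {
	var externalLabels arrayFlags
	flag.Var(&externalLabels, "external.label",
		"Optional label in the form 'name=value' to add to all generated recording rules and alerts. May be repeated.")

	// Simulate: -external.label=cluster=east-1 -external.label=replica=a
	flag.CommandLine.Parse([]string{"-external.label=cluster=east-1", "-external.label=replica=a"})

	fmt.Println(externalLabels) // [cluster=east-1 replica=a]
}
```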
@@ -125,6 +127,7 @@ func newManager(ctx context.Context) (*manager, error) {
 		groups:    make(map[uint64]*Group),
 		querier:   q,
 		notifiers: nts,
+		labels:    map[string]string{},
 	}
 	rw, err := remotewrite.Init(ctx)
 	if err != nil {
@@ -137,6 +140,14 @@ func newManager(ctx context.Context) (*manager, error) {
 		return nil, fmt.Errorf("failed to init remoteRead: %w", err)
 	}
 	manager.rr = rr
+
+	for _, s := range *externalLabels {
+		n := strings.IndexByte(s, '=')
+		if n < 0 {
+			return nil, fmt.Errorf("missing '=' in `-label`. It must contain label in the form `name=value`; got %q", s)
+		}
+		manager.labels[s[:n]] = s[n+1:]
+	}
 	return manager, nil
 }

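The parsing loop added to `newManager` can be tried on its own. A sketch assuming the same `name=value` convention as the hunk above; `parseExternalLabels` is a hypothetical helper name introduced only for this illustration:

```go
package main

import (
	"fmt"
	"strings"
)

// parseExternalLabels turns "name=value" flag values into a label map,
// splitting on the first '=' only so values may themselves contain '='.
func parseExternalLabels(flags []string) (map[string]string, error) {
	labels := map[string]string{}
	for _, s := range flags {
		n := strings.IndexByte(s, '=')
		if n < 0 {
			return nil, fmt.Errorf("missing '=' in %q; it must be in the form `name=value`", s)
		}
		labels[s[:n]] = s[n+1:]
	}
	return labels, nil
}

func main() {
	labels, err := parseExternalLabels([]string{"cluster=east-1", "replica=a"})
	if err != nil {
		panic(err)
	}
	fmt.Println(labels) // map[cluster:east-1 replica:a]
}
```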
@@ -22,6 +22,7 @@ type manager struct {
 	rr datasource.Querier

 	wg sync.WaitGroup
+	labels map[string]string

 	groupsMu sync.RWMutex
 	groups map[uint64]*Group
@@ -64,7 +65,7 @@ func (m *manager) close() {

 func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) {
 	if restore && m.rr != nil {
-		err := group.Restore(ctx, m.rr, *remoteReadLookBack)
+		err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
 		if err != nil {
 			logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
 		}
@@ -88,7 +89,7 @@ func (m *manager) update(ctx context.Context, path []string, validateTpl, valida

 	groupsRegistry := make(map[uint64]*Group)
 	for _, cfg := range groupsCfg {
-		ng := newGroup(cfg, *evaluationInterval)
+		ng := newGroup(cfg, *evaluationInterval, m.labels)
 		groupsRegistry[ng.ID()] = ng
 	}
