2020-02-16 19:59:02 +01:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
2020-03-13 11:19:31 +01:00
|
|
|
"context"
|
2020-02-16 19:59:02 +01:00
|
|
|
"flag"
|
2020-03-13 11:19:31 +01:00
|
|
|
"fmt"
|
2020-02-16 19:59:02 +01:00
|
|
|
"net/http"
|
2020-04-01 17:17:53 +02:00
|
|
|
"net/url"
|
|
|
|
"os"
|
2020-03-13 11:19:31 +01:00
|
|
|
"strings"
|
2020-04-06 13:44:03 +02:00
|
|
|
"sync"
|
2020-03-13 11:19:31 +01:00
|
|
|
"time"
|
2020-02-16 19:59:02 +01:00
|
|
|
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
2020-04-06 13:44:03 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
2020-04-27 23:18:02 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
2020-02-16 19:59:02 +01:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
|
2020-03-29 00:48:30 +01:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
2020-02-16 19:59:02 +01:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
2020-04-11 21:42:01 +02:00
|
|
|
"github.com/VictoriaMetrics/metrics"
|
2020-02-16 19:59:02 +01:00
|
|
|
)
|
|
|
|
|
|
|
|
var (
|
2020-04-12 13:47:26 +02:00
|
|
|
rulePath = flagutil.NewArray("rule", `Path to the file with alert rules.
|
|
|
|
Supports patterns. Flag can be specified multiple times.
|
2020-03-29 00:48:30 +01:00
|
|
|
Examples:
|
2020-04-12 13:47:26 +02:00
|
|
|
-rule /path/to/file. Path to a single file with alerting rules
|
|
|
|
-rule dir/*.yaml -rule /*.yaml. Relative path to all .yaml files in "dir" folder,
|
|
|
|
absolute path to all .yaml files in root.`)
|
2020-04-27 23:18:02 +02:00
|
|
|
validateTemplates = flag.Bool("rule.validateTemplates", true, "Indicates to validate annotation and label templates")
|
|
|
|
httpListenAddr = flag.String("httpListenAddr", ":8880", "Address to listen for http connections")
|
|
|
|
datasourceURL = flag.String("datasource.url", "", "Victoria Metrics or VMSelect url. Required parameter. e.g. http://127.0.0.1:8428")
|
|
|
|
basicAuthUsername = flag.String("datasource.basicAuth.username", "", "Optional basic auth username to use for -datasource.url")
|
|
|
|
basicAuthPassword = flag.String("datasource.basicAuth.password", "", "Optional basic auth password to use for -datasource.url")
|
|
|
|
remoteWriteURL = flag.String("remotewrite.url", "", "Optional URL to remote-write compatible storage where to write timeseries"+
|
|
|
|
"based on active alerts. E.g. http://127.0.0.1:8428")
|
2020-04-26 13:15:04 +02:00
|
|
|
evaluationInterval = flag.Duration("evaluationInterval", 1*time.Minute, "How often to evaluate the rules. Default 1m")
|
|
|
|
notifierURL = flag.String("notifier.url", "", "Prometheus alertmanager URL. Required parameter. e.g. http://127.0.0.1:9093")
|
|
|
|
externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier")
|
2020-02-16 19:59:02 +01:00
|
|
|
)
|
|
|
|
|
2020-04-06 13:44:03 +02:00
|
|
|
// TODO: hot configuration reload
// TODO: alerts state persistence
|
2020-02-16 19:59:02 +01:00
|
|
|
func main() {
|
|
|
|
envflag.Parse()
|
|
|
|
buildinfo.Init()
|
|
|
|
logger.Init()
|
2020-03-29 00:48:30 +01:00
|
|
|
checkFlags()
|
2020-03-13 11:19:31 +01:00
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
2020-04-01 21:29:11 +02:00
|
|
|
eu, err := getExternalURL(*externalURL, *httpListenAddr, httpserver.IsTLS())
|
2020-04-01 17:17:53 +02:00
|
|
|
if err != nil {
|
|
|
|
logger.Fatalf("can not get external url:%s ", err)
|
|
|
|
}
|
2020-04-06 13:44:03 +02:00
|
|
|
notifier.InitTemplateFunc(eu)
|
2020-02-16 19:59:02 +01:00
|
|
|
|
2020-03-29 00:48:30 +01:00
|
|
|
logger.Infof("reading alert rules configuration file from %s", strings.Join(*rulePath, ";"))
|
2020-04-26 13:15:04 +02:00
|
|
|
groups, err := Parse(*rulePath, *validateTemplates)
|
2020-02-16 19:59:02 +01:00
|
|
|
if err != nil {
|
2020-04-27 23:18:02 +02:00
|
|
|
logger.Fatalf("cannot parse configuration file: %s", err)
|
2020-02-16 19:59:02 +01:00
|
|
|
}
|
2020-03-29 00:48:30 +01:00
|
|
|
|
2020-03-13 11:19:31 +01:00
|
|
|
w := &watchdog{
|
|
|
|
storage: datasource.NewVMStorage(*datasourceURL, *basicAuthUsername, *basicAuthPassword, &http.Client{}),
|
2020-04-11 17:49:23 +02:00
|
|
|
alertProvider: notifier.NewAlertManager(*notifierURL, func(group, name string) string {
|
2020-04-11 11:40:24 +02:00
|
|
|
return fmt.Sprintf("%s/api/v1/%s/%s/status", eu, group, name)
|
2020-03-13 11:19:31 +01:00
|
|
|
}, &http.Client{}),
|
|
|
|
}
|
2020-04-27 23:18:02 +02:00
|
|
|
|
|
|
|
if *remoteWriteURL != "" {
|
|
|
|
c, err := remotewrite.NewClient(ctx, remotewrite.Config{
|
|
|
|
Addr: *remoteWriteURL,
|
|
|
|
FlushInterval: *evaluationInterval,
|
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
logger.Fatalf("failed to init remotewrite client: %s", err)
|
|
|
|
}
|
|
|
|
w.rw = c
|
|
|
|
}
|
|
|
|
|
2020-04-06 13:44:03 +02:00
|
|
|
wg := sync.WaitGroup{}
|
|
|
|
for i := range groups {
|
|
|
|
wg.Add(1)
|
|
|
|
go func(group Group) {
|
2020-03-13 11:19:31 +01:00
|
|
|
w.run(ctx, group, *evaluationInterval)
|
2020-04-06 13:44:03 +02:00
|
|
|
wg.Done()
|
|
|
|
}(groups[i])
|
2020-02-16 19:59:02 +01:00
|
|
|
}
|
2020-04-06 13:44:03 +02:00
|
|
|
|
2020-04-11 11:40:24 +02:00
|
|
|
go httpserver.Serve(*httpListenAddr, (&requestHandler{groups: groups}).handler)
|
2020-04-06 13:44:03 +02:00
|
|
|
|
2020-02-16 19:59:02 +01:00
|
|
|
sig := procutil.WaitForSigterm()
|
|
|
|
logger.Infof("service received signal %s", sig)
|
2020-02-21 22:15:05 +01:00
|
|
|
if err := httpserver.Stop(*httpListenAddr); err != nil {
|
|
|
|
logger.Fatalf("cannot stop the webservice: %s", err)
|
|
|
|
}
|
2020-03-13 11:19:31 +01:00
|
|
|
cancel()
|
2020-04-27 23:18:02 +02:00
|
|
|
if w.rw != nil {
|
|
|
|
err := w.rw.Close()
|
|
|
|
if err != nil {
|
|
|
|
logger.Fatalf("cannot stop the remotewrite: %s", err)
|
|
|
|
}
|
|
|
|
}
|
2020-04-06 13:44:03 +02:00
|
|
|
wg.Wait()
|
2020-02-16 19:59:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
type watchdog struct {
|
2020-03-13 11:19:31 +01:00
|
|
|
storage *datasource.VMStorage
|
2020-04-06 13:44:03 +02:00
|
|
|
alertProvider notifier.Notifier
|
2020-04-27 23:18:02 +02:00
|
|
|
rw *remotewrite.Client
|
2020-03-13 11:19:31 +01:00
|
|
|
}
|
|
|
|
|
2020-04-11 21:42:01 +02:00
|
|
|
var (
|
|
|
|
iterationTotal = metrics.NewCounter(`vmalert_iteration_total`)
|
|
|
|
iterationDuration = metrics.NewSummary(`vmalert_iteration_duration_seconds`)
|
|
|
|
|
|
|
|
execTotal = metrics.NewCounter(`vmalert_execution_total`)
|
|
|
|
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
|
|
|
|
execDuration = metrics.NewSummary(`vmalert_execution_duration_seconds`)
|
2020-04-27 23:18:02 +02:00
|
|
|
|
|
|
|
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
|
|
|
|
alertsSent = metrics.NewCounter(`vmalert_alerts_sent_total`)
|
|
|
|
alertsSendErrors = metrics.NewCounter(`vmalert_alerts_send_errors_total`)
|
|
|
|
|
|
|
|
remoteWriteSent = metrics.NewCounter(`vmalert_remotewrite_sent_total`)
|
|
|
|
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
|
2020-04-11 21:42:01 +02:00
|
|
|
)
|
|
|
|
|
2020-04-06 13:44:03 +02:00
|
|
|
func (w *watchdog) run(ctx context.Context, group Group, evaluationInterval time.Duration) {
|
2020-04-11 21:42:01 +02:00
|
|
|
logger.Infof("watchdog for %s has been started", group.Name)
|
2020-03-13 11:19:31 +01:00
|
|
|
t := time.NewTicker(evaluationInterval)
|
|
|
|
defer t.Stop()
|
|
|
|
for {
|
2020-04-11 21:42:01 +02:00
|
|
|
|
2020-03-13 11:19:31 +01:00
|
|
|
select {
|
|
|
|
case <-t.C:
|
2020-04-11 21:42:01 +02:00
|
|
|
iterationTotal.Inc()
|
|
|
|
iterationStart := time.Now()
|
2020-04-06 13:44:03 +02:00
|
|
|
for _, rule := range group.Rules {
|
2020-04-11 21:42:01 +02:00
|
|
|
execTotal.Inc()
|
|
|
|
|
|
|
|
execStart := time.Now()
|
|
|
|
err := rule.Exec(ctx, w.storage)
|
|
|
|
execDuration.UpdateDuration(execStart)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
execErrors.Inc()
|
2020-04-06 13:44:03 +02:00
|
|
|
logger.Errorf("failed to execute rule %q.%q: %s", group.Name, rule.Name, err)
|
2020-03-13 11:19:31 +01:00
|
|
|
continue
|
|
|
|
}
|
2020-04-11 21:42:01 +02:00
|
|
|
|
2020-04-27 23:18:02 +02:00
|
|
|
var alertsToSend []notifier.Alert
|
|
|
|
for _, a := range rule.alerts {
|
|
|
|
if a.State != notifier.StatePending {
|
|
|
|
alertsToSend = append(alertsToSend, *a)
|
|
|
|
}
|
|
|
|
if a.State == notifier.StateInactive || w.rw == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
tss := rule.AlertToTimeSeries(a, execStart)
|
|
|
|
for _, ts := range tss {
|
|
|
|
remoteWriteSent.Inc()
|
|
|
|
if err := w.rw.Push(ts); err != nil {
|
|
|
|
remoteWriteErrors.Inc()
|
|
|
|
logger.Errorf("failed to push timeseries to remotewrite: %s", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
alertsSent.Add(len(alertsToSend))
|
|
|
|
if err := w.alertProvider.Send(alertsToSend); err != nil {
|
|
|
|
alertsSendErrors.Inc()
|
2020-04-06 13:44:03 +02:00
|
|
|
logger.Errorf("failed to send alert for rule %q.%q: %s", group.Name, rule.Name, err)
|
2020-03-13 11:19:31 +01:00
|
|
|
}
|
|
|
|
}
|
2020-04-11 21:42:01 +02:00
|
|
|
iterationDuration.UpdateDuration(iterationStart)
|
2020-03-13 11:19:31 +01:00
|
|
|
case <-ctx.Done():
|
2020-04-06 13:44:03 +02:00
|
|
|
logger.Infof("%s received stop signal", group.Name)
|
2020-03-13 11:19:31 +01:00
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
2020-02-16 19:59:02 +01:00
|
|
|
}
|
|
|
|
|
2020-04-01 17:17:53 +02:00
|
|
|
// getExternalURL returns the URL under which this vmalert instance is
// reachable from the outside.
//
// If externalURL is non-empty it is parsed and returned as is. Otherwise the
// URL is assembled from the scheme ("https" when isSecure is true, "http"
// otherwise), the local hostname reported by os.Hostname and the port part of
// httpListenAddr. A listen address without a port yields a URL without a port.
//
// An error is returned when the hostname cannot be determined or the
// resulting string is not a valid URL.
func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL, error) {
	if externalURL != "" {
		return url.Parse(externalURL)
	}
	hname, err := os.Hostname()
	if err != nil {
		return nil, fmt.Errorf("cannot get hostname: %w", err)
	}
	port := ""
	// The port is whatever follows the LAST colon, so that bracketed IPv6
	// listen addresses such as "[::1]:8880" yield ":8880". A suffix that still
	// contains "]" means the colon belongs to the IPv6 literal itself
	// (e.g. "[::1]" without a port) and is ignored.
	if i := strings.LastIndex(httpListenAddr, ":"); i >= 0 {
		if suffix := httpListenAddr[i:]; !strings.Contains(suffix, "]") {
			port = suffix
		}
	}
	schema := "http://"
	if isSecure {
		schema = "https://"
	}
	return url.Parse(fmt.Sprintf("%s%s%s", schema, hname, port))
}
|
|
|
|
|
2020-03-29 00:48:30 +01:00
|
|
|
func checkFlags() {
|
2020-04-11 17:49:23 +02:00
|
|
|
if *notifierURL == "" {
|
2020-03-29 00:48:30 +01:00
|
|
|
flag.PrintDefaults()
|
2020-04-11 17:49:23 +02:00
|
|
|
logger.Fatalf("notifier.url is empty")
|
2020-03-29 00:48:30 +01:00
|
|
|
}
|
|
|
|
if *datasourceURL == "" {
|
|
|
|
flag.PrintDefaults()
|
|
|
|
logger.Fatalf("datasource.url is empty")
|
|
|
|
}
|
|
|
|
}
|