2020-04-27 23:19:27 +02:00
package main
import (
"context"
"flag"
"fmt"
"net/url"
"os"
2020-06-21 12:32:46 +02:00
"strconv"
2020-04-27 23:19:27 +02:00
"strings"
"time"
2020-10-20 09:15:21 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
2020-04-27 23:19:27 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
2020-06-28 13:26:22 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remoteread"
2020-04-27 23:19:27 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
2022-05-14 11:38:44 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
2020-04-27 23:19:27 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
2020-05-14 21:01:51 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
2020-04-27 23:19:27 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
2022-07-21 18:58:22 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/pushmetrics"
2020-04-27 23:19:27 +02:00
"github.com/VictoriaMetrics/metrics"
)
// Command-line flags of vmalert. They are registered at package init and
// read (dereferenced) by main, newManager and configReload.
var (
2022-10-01 17:26:05 +02:00
// rulePath holds the values of the repeatable -rule flag; it is passed to config.Parse.
rulePath = flagutil . NewArrayString ( "rule" , ` Path to the file with alert rules .
2021-03-09 21:49:50 +01:00
Supports patterns . Flag can be specified multiple times .
2020-04-27 23:19:27 +02:00
Examples :
2020-08-20 23:36:38 +02:00
- rule = "/path/to/file" . Path to a single file with alerting rules
2021-03-09 21:49:50 +01:00
- rule = "dir/*.yaml" - rule = "/*.yaml" . Relative path to all . yaml files in "dir" folder ,
2020-08-13 15:43:55 +02:00
absolute path to all . yaml files in root .
Rule files may contain % { ENV_VAR } placeholders , which are substituted by the corresponding env vars . ` )
2020-06-06 22:27:09 +02:00
2022-10-01 17:26:05 +02:00
// ruleTemplatesPath holds the values of the repeatable -rule.templates flag; it is passed to templates.Load.
ruleTemplatesPath = flagutil . NewArrayString ( "rule.templates" , ` Path or glob pattern to location with go template definitions
2022-05-14 11:38:44 +02:00
for rules annotations templating . Flag can be specified multiple times .
Examples :
- rule . templates = "/path/to/file" . Path to a single file with go templates
- rule . templates = "dir/*.tpl" - rule . templates = "/*.tpl" . Relative path to all . tpl files in "dir" folder ,
absolute path to all . tpl files in root . ` )
2021-05-25 15:27:22 +02:00
// rulesCheckInterval is the deprecated predecessor of configCheckInterval.
// configReload falls back to it only when configCheckInterval is 0.
rulesCheckInterval = flag . Duration ( "rule.configCheckInterval" , 0 , "Interval for checking for changes in '-rule' files. " +
2022-02-02 13:11:41 +01:00
"By default the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead" )
// configCheckInterval is the period of the reload ticker created in configReload; 0 leaves the ticker disabled.
configCheckInterval = flag . Duration ( "configCheckInterval" , 0 , "Interval for checking for changes in '-rule' or '-notifier.config' files. " +
"By default the checking is disabled. Send SIGHUP signal in order to force config check for changes." )
2021-05-25 15:27:22 +02:00
2020-06-28 13:26:22 +02:00
httpListenAddr = flag . String ( "httpListenAddr" , ":8880" , "Address to listen for http connections" )
evaluationInterval = flag . Duration ( "evaluationInterval" , time . Minute , "How often to evaluate the rules" )
2020-06-06 22:27:09 +02:00
// validateTemplates controls whether notifier.ValidateTemplates is handed to config.Parse
// (outside of dry-run mode) and whether the -external.alert.source template is validated.
validateTemplates = flag . Bool ( "rule.validateTemplates" , true , "Whether to validate annotation and label templates" )
validateExpressions = flag . Bool ( "rule.validateExpressions" , true , "Whether to validate rules expressions via MetricsQL engine" )
2021-09-13 14:48:18 +02:00
maxResolveDuration = flag . Duration ( "rule.maxResolveDuration" , 0 , "Limits the maximum duration for automatic alert expiration, " +
"which is by default equal to 3 evaluation intervals of the parent group." )
2022-03-16 16:26:33 +01:00
resendDelay = flag . Duration ( "rule.resendDelay" , 0 , "Minimum amount of time to wait before resending an alert to notifier" )
2020-06-21 12:32:46 +02:00
// externalURL is the raw -external.url value; when empty, getExternalURL derives a URL from the hostname and -httpListenAddr.
externalURL = flag . String ( "external.url" , "" , "External URL is used as alert's source for sent alerts to the notifier" )
2022-10-05 21:52:30 +02:00
externalAlertSource = flag . String ( "external.alert.source" , "" , ` External Alert Source allows to override the Source link for alerts sent to AlertManager ` +
` for cases where you want to build a custom link to Grafana, Prometheus or any other service. ` +
` Supports templating - see https://docs.victoriametrics.com/vmalert.html#templating . ` +
` For example, link to Grafana: -external.alert.source='explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\", { \"expr\": \" {{ $expr | quotesEscape | crlfEscape | queryEscape }} \"}, { \"mode\":\"Metrics\"}, { \"ui\":[true,true,true,\"none\"]}]' . ` +
2022-10-27 21:30:27 +02:00
` If empty 'vmalert/alert?group_id= {{ .GroupID }} &alert_id= {{ .AlertID }} ' is used ` )
2022-10-01 17:26:05 +02:00
// externalLabels holds the repeatable -external.label values; newManager splits each one at the first '='.
externalLabels = flagutil . NewArrayString ( "external.label" , "Optional label in the form 'Name=value' to add to all generated recording rules and alerts. " +
2020-07-28 13:20:31 +02:00
"Pass multiple -label flags in order to add multiple label sets." )
2020-06-28 13:26:22 +02:00
remoteReadLookBack = flag . Duration ( "remoteRead.lookback" , time . Hour , "Lookback defines how far to look into past for alerts timeseries." +
" For example, if lookback=1h then range from now() to now()-1h will be scanned." )
2021-05-05 09:07:19 +02:00
remoteReadIgnoreRestoreErrors = flag . Bool ( "remoteRead.ignoreRestoreErrors" , true , "Whether to ignore errors from remote storage when restoring alerts state on startup." )
2020-10-20 09:15:21 +02:00
2022-02-02 13:11:41 +01:00
disableAlertGroupLabel = flag . Bool ( "disableAlertgroupLabel" , false , "Whether to disable adding group's Name as label to generated alerts and time series." )
2021-08-21 19:08:55 +02:00
2022-09-15 12:00:36 +02:00
// dryRun makes main parse the -rule files (with template and expression validation forced on) and exit.
dryRun = flag . Bool ( "dryRun" , false , "Whether to check only config files without running vmalert. The rules file are validated. The -rule flag must be specified." )
2020-04-27 23:19:27 +02:00
)
2021-10-13 14:25:11 +02:00
// alertURLGeneratorFn is assigned once in main from getAlertURLGenerator
// and later passed to notifier.Init by newManager.
var alertURLGeneratorFn notifier . AlertURLGenerator
2020-04-27 23:19:27 +02:00
func main ( ) {
2020-05-16 10:59:30 +02:00
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag . CommandLine . SetOutput ( os . Stdout )
2020-06-05 09:42:56 +02:00
flag . Usage = usage
2020-04-27 23:19:27 +02:00
envflag . Parse ( )
2022-08-11 09:56:40 +02:00
remoteread . InitSecretFlags ( )
remotewrite . InitSecretFlags ( )
datasource . InitSecretFlags ( )
2020-04-27 23:19:27 +02:00
buildinfo . Init ( )
logger . Init ( )
2022-07-22 12:35:58 +02:00
pushmetrics . Init ( )
2022-05-14 11:38:44 +02:00
err := templates . Load ( * ruleTemplatesPath , true )
if err != nil {
logger . Fatalf ( "failed to parse %q: %s" , * ruleTemplatesPath , err )
}
2020-06-23 21:45:45 +02:00
2020-10-20 09:15:21 +02:00
if * dryRun {
2022-07-22 13:50:41 +02:00
groups , err := config . Parse ( * rulePath , notifier . ValidateTemplates , true )
2020-10-20 09:15:21 +02:00
if err != nil {
2021-06-09 11:20:38 +02:00
logger . Fatalf ( "failed to parse %q: %s" , * rulePath , err )
2020-10-20 09:15:21 +02:00
}
if len ( groups ) == 0 {
logger . Fatalf ( "No rules for validation. Please specify path to file(s) with alerting and/or recording rules using `-rule` flag" )
}
return
}
2021-10-13 14:25:11 +02:00
eu , err := getExternalURL ( * externalURL , * httpListenAddr , httpserver . IsTLS ( ) )
if err != nil {
logger . Fatalf ( "failed to init `external.url`: %s" , err )
}
2022-05-14 11:38:44 +02:00
2021-10-13 14:25:11 +02:00
alertURLGeneratorFn , err = getAlertURLGenerator ( eu , * externalAlertSource , * validateTemplates )
if err != nil {
logger . Fatalf ( "failed to init `external.alert.source`: %s" , err )
}
2022-07-22 13:50:41 +02:00
var validateTplFn config . ValidateTplFn
if * validateTemplates {
validateTplFn = notifier . ValidateTemplates
}
2021-06-09 11:20:38 +02:00
if * replayFrom != "" || * replayTo != "" {
rw , err := remotewrite . Init ( context . Background ( ) )
if err != nil {
logger . Fatalf ( "failed to init remoteWrite: %s" , err )
}
2021-12-21 19:25:47 +01:00
if rw == nil {
logger . Fatalf ( "remoteWrite.url can't be empty in replay mode" )
}
2022-07-22 13:50:41 +02:00
groupsCfg , err := config . Parse ( * rulePath , validateTplFn , * validateExpressions )
2021-06-09 11:20:38 +02:00
if err != nil {
logger . Fatalf ( "cannot parse configuration file: %s" , err )
}
2021-08-31 13:57:47 +02:00
// prevent queries from caching and boundaries aligning
// when querying VictoriaMetrics datasource.
2021-12-02 13:45:08 +01:00
q , err := datasource . Init ( url . Values { "nocache" : { "1" } } )
2021-06-09 11:20:38 +02:00
if err != nil {
logger . Fatalf ( "failed to init datasource: %s" , err )
}
if err := replay ( groupsCfg , q , rw ) ; err != nil {
logger . Fatalf ( "replay failed: %s" , err )
}
return
}
2020-06-28 13:26:22 +02:00
ctx , cancel := context . WithCancel ( context . Background ( ) )
manager , err := newManager ( ctx )
2020-06-23 21:45:45 +02:00
if err != nil {
2020-06-28 13:26:22 +02:00
logger . Fatalf ( "failed to init: %s" , err )
2020-04-27 23:19:27 +02:00
}
2021-05-25 15:27:22 +02:00
logger . Infof ( "reading rules configuration file from %q" , strings . Join ( * rulePath , ";" ) )
2022-07-22 13:50:41 +02:00
groupsCfg , err := config . Parse ( * rulePath , validateTplFn , * validateExpressions )
2021-05-25 15:27:22 +02:00
if err != nil {
logger . Fatalf ( "cannot parse configuration file: %s" , err )
}
2021-05-21 15:34:03 +02:00
2021-10-19 15:35:27 +02:00
// Register SIGHUP handler for config re-read just before manager.start call.
// This guarantees that the config will be re-read if the signal arrives during manager.start call.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240
sighupCh := procutil . NewSighupChan ( )
2021-05-25 15:27:22 +02:00
if err := manager . start ( ctx , groupsCfg ) ; err != nil {
2020-05-10 18:58:17 +02:00
logger . Fatalf ( "failed to start: %s" , err )
}
2020-05-09 11:32:12 +02:00
2021-10-19 15:35:27 +02:00
go configReload ( ctx , manager , groupsCfg , sighupCh )
2020-05-09 11:32:12 +02:00
2020-05-10 18:58:17 +02:00
rh := & requestHandler { m : manager }
2020-05-18 10:55:16 +02:00
go httpserver . Serve ( * httpListenAddr , rh . handler )
2020-04-27 23:19:27 +02:00
sig := procutil . WaitForSigterm ( )
logger . Infof ( "service received signal %s" , sig )
if err := httpserver . Stop ( * httpListenAddr ) ; err != nil {
logger . Fatalf ( "cannot stop the webservice: %s" , err )
}
cancel ( )
2020-05-10 18:58:17 +02:00
manager . close ( )
2020-04-27 23:19:27 +02:00
}
// Metrics describing config reloads; all of them are updated by configReload.
var (
2020-05-10 18:58:17 +02:00
// configReloads is incremented each time a reload is triggered via SIGHUP.
configReloads = metrics . NewCounter ( ` vmalert_config_last_reload_total ` )
// configReloadErrors is incremented whenever any reload step fails.
configReloadErrors = metrics . NewCounter ( ` vmalert_config_last_reload_errors_total ` )
// configSuccess is set to 1 after a successful reload attempt and to 0 after a failed one.
configSuccess = metrics . NewCounter ( ` vmalert_config_last_reload_successful ` )
// configTimestamp is set to fasttime.UnixTimestamp() at start and after each applied rules update.
configTimestamp = metrics . NewCounter ( ` vmalert_config_last_reload_success_timestamp_seconds ` )
2020-04-27 23:19:27 +02:00
)
2020-06-28 13:26:22 +02:00
func newManager ( ctx context . Context ) ( * manager , error ) {
2021-08-31 13:57:47 +02:00
q , err := datasource . Init ( nil )
2020-06-28 13:26:22 +02:00
if err != nil {
2020-06-30 21:58:18 +02:00
return nil , fmt . Errorf ( "failed to init datasource: %w" , err )
2020-06-28 13:26:22 +02:00
}
2022-02-15 14:59:45 +01:00
2022-05-09 10:11:56 +02:00
labels := make ( map [ string ] string )
2022-02-15 14:59:45 +01:00
for _ , s := range * externalLabels {
if len ( s ) == 0 {
continue
}
n := strings . IndexByte ( s , '=' )
if n < 0 {
return nil , fmt . Errorf ( "missing '=' in `-label`. It must contain label in the form `Name=value`; got %q" , s )
}
labels [ s [ : n ] ] = s [ n + 1 : ]
}
nts , err := notifier . Init ( alertURLGeneratorFn , labels , * externalURL )
2020-06-28 13:26:22 +02:00
if err != nil {
2020-06-30 21:58:18 +02:00
return nil , fmt . Errorf ( "failed to init notifier: %w" , err )
2020-06-28 13:26:22 +02:00
}
manager := & manager {
2021-04-28 22:41:15 +02:00
groups : make ( map [ uint64 ] * Group ) ,
querierBuilder : q ,
notifiers : nts ,
2022-02-15 14:59:45 +01:00
labels : labels ,
2020-06-28 13:26:22 +02:00
}
rw , err := remotewrite . Init ( ctx )
if err != nil {
2020-06-30 21:58:18 +02:00
return nil , fmt . Errorf ( "failed to init remoteWrite: %w" , err )
2020-06-28 13:26:22 +02:00
}
manager . rw = rw
rr , err := remoteread . Init ( )
if err != nil {
2020-06-30 21:58:18 +02:00
return nil , fmt . Errorf ( "failed to init remoteRead: %w" , err )
2020-06-28 13:26:22 +02:00
}
manager . rr = rr
2020-07-28 13:20:31 +02:00
2020-06-28 13:26:22 +02:00
return manager , nil
}
2020-04-27 23:19:27 +02:00
// getExternalURL returns the URL under which vmalert is reachable from outside.
//
// If externalURL is non-empty, it is parsed and returned as is. Otherwise the
// URL is built from the local hostname, the port of httpListenAddr (if any)
// and the "https" scheme when isSecure is true or "http" otherwise.
//
// An error is returned if the hostname cannot be determined or the resulting
// URL cannot be parsed.
func getExternalURL(externalURL, httpListenAddr string, isSecure bool) (*url.URL, error) {
	if externalURL != "" {
		return url.Parse(externalURL)
	}
	hname, err := os.Hostname()
	if err != nil {
		return nil, err
	}

	// The port is whatever follows the LAST colon, so bracketed IPv6 listen
	// addresses such as "[::1]:8880" yield ":8880" instead of a fragment of
	// the address. An address ending with ']' has no port at all.
	port := ""
	if n := strings.LastIndexByte(httpListenAddr, ':'); n >= 0 && !strings.HasSuffix(httpListenAddr, "]") {
		port = httpListenAddr[n:]
	}

	schema := "http://"
	if isSecure {
		schema = "https://"
	}
	return url.Parse(fmt.Sprintf("%s%s%s", schema, hname, port))
}
2020-06-21 12:32:46 +02:00
func getAlertURLGenerator ( externalURL * url . URL , externalAlertSource string , validateTemplate bool ) ( notifier . AlertURLGenerator , error ) {
if externalAlertSource == "" {
2022-07-08 10:26:13 +02:00
return func ( a notifier . Alert ) string {
gID , aID := strconv . FormatUint ( a . GroupID , 10 ) , strconv . FormatUint ( a . ID , 10 )
2022-08-17 14:46:28 +02:00
return fmt . Sprintf ( "%s/vmalert/alert?%s=%s&%s=%s" , externalURL , paramGroupID , gID , paramAlertID , aID )
2020-06-21 12:32:46 +02:00
} , nil
}
if validateTemplate {
if err := notifier . ValidateTemplates ( map [ string ] string {
"tpl" : externalAlertSource ,
} ) ; err != nil {
2020-06-30 21:58:18 +02:00
return nil , fmt . Errorf ( "error validating source template %s: %w" , externalAlertSource , err )
2020-06-21 12:32:46 +02:00
}
}
m := map [ string ] string {
"tpl" : externalAlertSource ,
}
return func ( alert notifier . Alert ) string {
2022-10-05 19:25:03 +02:00
templated , err := alert . ExecTemplate ( nil , alert . Labels , m )
2020-06-21 12:32:46 +02:00
if err != nil {
logger . Errorf ( "can not exec source template %s" , err )
}
2022-10-27 21:30:27 +02:00
return fmt . Sprintf ( "%s/%s" , externalURL , templated [ "tpl" ] )
2020-06-21 12:32:46 +02:00
} , nil
}
2020-06-05 09:42:56 +02:00
// usage hands the short vmalert description to flagutil.Usage.
// It is installed as flag.Usage in main.
func usage ( ) {
const s = `
vmalert processes alerts and recording rules .
2021-04-20 19:16:17 +02:00
See the docs at https : //docs.victoriametrics.com/vmalert.html .
2020-06-05 09:42:56 +02:00
`
2020-12-03 20:40:30 +01:00
flagutil . Usage ( s )
2020-06-05 09:42:56 +02:00
}
2021-05-25 15:27:22 +02:00
2021-10-19 15:35:27 +02:00
func configReload ( ctx context . Context , m * manager , groupsCfg [ ] config . Group , sighupCh <- chan os . Signal ) {
2021-05-25 15:27:22 +02:00
var configCheckCh <- chan time . Time
2022-02-02 13:11:41 +01:00
checkInterval := * configCheckInterval
if checkInterval == 0 && * rulesCheckInterval > 0 {
logger . Warnf ( "flag `rule.configCheckInterval` is deprecated - use `configCheckInterval` instead" )
checkInterval = * rulesCheckInterval
}
if checkInterval > 0 {
ticker := time . NewTicker ( checkInterval )
2021-05-25 15:27:22 +02:00
configCheckCh = ticker . C
defer ticker . Stop ( )
}
2022-07-22 13:50:41 +02:00
var validateTplFn config . ValidateTplFn
if * validateTemplates {
validateTplFn = notifier . ValidateTemplates
}
2021-05-25 15:27:22 +02:00
// init reload metrics with positive values to improve alerting conditions
configSuccess . Set ( 1 )
configTimestamp . Set ( fasttime . UnixTimestamp ( ) )
for {
select {
case <- ctx . Done ( ) :
return
case <- sighupCh :
2022-05-14 11:38:44 +02:00
tmplMsg := ""
if len ( * ruleTemplatesPath ) > 0 {
tmplMsg = fmt . Sprintf ( "and templates %q " , * ruleTemplatesPath )
}
logger . Infof ( "SIGHUP received. Going to reload rules %q %s..." , * rulePath , tmplMsg )
2021-05-25 15:27:22 +02:00
configReloads . Inc ( )
case <- configCheckCh :
}
2022-02-02 13:11:41 +01:00
if err := notifier . Reload ( ) ; err != nil {
configReloadErrors . Inc ( )
configSuccess . Set ( 0 )
logger . Errorf ( "failed to reload notifier config: %s" , err )
continue
}
2022-05-14 11:38:44 +02:00
err := templates . Load ( * ruleTemplatesPath , false )
if err != nil {
configReloadErrors . Inc ( )
configSuccess . Set ( 0 )
logger . Errorf ( "failed to load new templates: %s" , err )
continue
}
2022-07-22 13:50:41 +02:00
newGroupsCfg , err := config . Parse ( * rulePath , validateTplFn , * validateExpressions )
2021-05-25 15:27:22 +02:00
if err != nil {
2021-08-03 11:55:29 +02:00
configReloadErrors . Inc ( )
configSuccess . Set ( 0 )
2021-05-25 15:27:22 +02:00
logger . Errorf ( "cannot parse configuration file: %s" , err )
continue
}
if configsEqual ( newGroupsCfg , groupsCfg ) {
2022-05-14 11:38:44 +02:00
templates . Reload ( )
2021-08-31 11:28:02 +02:00
// set success to 1 since previous reload
// could have been unsuccessful
configSuccess . Set ( 1 )
2021-05-25 15:27:22 +02:00
// config didn't change - skip it
continue
}
2021-11-30 00:23:49 +01:00
if err := m . update ( ctx , newGroupsCfg , false ) ; err != nil {
2021-05-25 15:27:22 +02:00
configReloadErrors . Inc ( )
configSuccess . Set ( 0 )
logger . Errorf ( "error while reloading rules: %s" , err )
continue
}
2022-05-14 11:38:44 +02:00
templates . Reload ( )
2021-11-30 00:23:49 +01:00
groupsCfg = newGroupsCfg
2021-05-25 15:27:22 +02:00
configSuccess . Set ( 1 )
configTimestamp . Set ( fasttime . UnixTimestamp ( ) )
logger . Infof ( "Rules reloaded successfully from %q" , * rulePath )
}
}
// configsEqual reports whether a and b have the same length and
// pairwise-equal Checksum values at every position.
func configsEqual(a, b []config.Group) bool {
	n := len(a)
	if n != len(b) {
		return false
	}
	for i := 0; i < n; i++ {
		if a[i].Checksum == b[i].Checksum {
			continue
		}
		return false
	}
	return true
}